Repository: speechandlanguageprocessing/ICASSP2022-Depression Branch: main Commit: eded8cc0818d Files: 17 Total size: 174.0 KB Directory structure: gitextract_kdhj1m2d/ ├── DepressionCollected/ │ ├── Classification/ │ │ ├── AudioModelChecking.py │ │ ├── AudioTraditionalClassifiers.py │ │ ├── FuseModelChecking.py │ │ ├── TextModelChecking.py │ │ ├── TextTraditionalClassifiers.py │ │ ├── audio_features_whole.py │ │ ├── audio_gru_whole.py │ │ ├── fuse_net_whole.py │ │ ├── text_bilstm_whole.py │ │ └── text_features_whole.py │ ├── DAICFeatureExtarction/ │ │ ├── feature_extraction.py │ │ └── queries.txt │ └── Regression/ │ ├── AudioModelChecking.py │ ├── audio_bilstm_perm.py │ ├── fuse_net.py │ └── text_bilstm_perm.py └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: DepressionCollected/Classification/AudioModelChecking.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix import numpy as np import pandas as pd import wave import re import os import tensorflow.compat.v1 as tf import random import itertools from audio_gru_whole import AudioBiLSTM from sklearn.preprocessing import StandardScaler import pickle class BiLSTM(nn.Module): def __init__(self, rnn_layers, dropout, num_classes, audio_hidden_dims, audio_embed_size): super(BiLSTM, self).__init__() self.lstm_net_audio = nn.GRU(audio_embed_size, audio_hidden_dims, num_layers=rnn_layers, dropout=dropout, batch_first=True) self.fc_audio = nn.Sequential( nn.Dropout(dropout), nn.Linear(audio_hidden_dims, audio_hidden_dims), nn.ReLU(), nn.Dropout(dropout), nn.Linear(audio_hidden_dims, num_classes), # nn.ReLU(), nn.Softmax(dim=1) ) def forward(self, x): x, _ = self.lstm_net_audio(x) # x = self.bn(x) x = x.sum(dim=1) out = self.fc_audio(x) return out # prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) # audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/whole_samples_clf_avid256.npz'))['arr_0'], axis=2) # audio_targets = np.load(os.path.join(prefix, 'Features/Audio/whole_labels_clf_avid256.npz'))['arr_0'] prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] audio_dep_idxs = np.where(audio_targets == 1)[0] audio_non_idxs = np.where(audio_targets == 0)[0] def standard_confusion_matrix(y_test, y_test_pred): """ Make confusion matrix with format: ----------- | TP | FP | ----------- | FN | TN | ----------- Parameters ---------- y_true : ndarray - 1D y_pred : ndarray - 1D Returns ------- ndarray - 2D """ [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] y_test_pred = y_test_pred_proba # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix config = { 'num_classes': 2, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 256, 'batch_size': 4, 'epochs': 100, 'learning_rate': 1e-5, 'hidden_dims': 256, 'bidirectional': False, 'cuda': False } # audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio/BiLSTM_gru_vlad256_256_0.80.pt')) # audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio3/BiLSTM_gru_vlad256_256_0.89.pt')) # audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio2/BiLSTM_gru_vlad256_256_0.65.pt')) # model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'], \ # config['hidden_dims'], config['embedding_size']) # model_state_dict = {} # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] # model_state_dict = audio_lstm_model.state_dict() # model.load_state_dict(model_state_dict, strict=False) def evaluate(model, test_idxs): model.eval() batch_idx = 1 total_loss = 0 pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) # X_test = audio_features[test_dep_idxs+test_non_idxs] # Y_test = audio_targets[test_dep_idxs+test_non_idxs] X_test = audio_features[test_idxs] Y_test = audio_targets[test_idxs] global max_train_acc, max_acc,max_f1 for i in range(0, X_test.shape[0], config['batch_size']): if i + config['batch_size'] > X_test.shape[0]: x, y = X_test[i:], Y_test[i:] else: x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), Variable(torch.from_numpy(y)) with torch.no_grad(): output = model(x.squeeze(2)) pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) return precision, recall, f1_score # evaluate(audio_features_test, fuse_targets_test, audio_lstm_model) # evaluate(model) idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt'] ps, rs, fs = [], [], [] for fold in range(3): train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold])), allow_pickle=True) test_idxs_tmp = list(set(list(audio_dep_idxs)+list(audio_non_idxs)) - set(train_idxs_tmp)) audio_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Audio/{}'.format(audio_model_paths[fold]))) train_idxs, test_idxs = [], [] for idx in train_idxs_tmp: if idx in audio_dep_idxs: feat = audio_features[idx] count = 0 resample_idxs = [0,1,2,3,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) train_idxs.append(len(audio_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in audio_dep_idxs: feat = audio_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) test_idxs.append(len(audio_features)-1) count += 1 else: test_idxs.append(idx) p, r, f = evaluate(audio_lstm_model, test_idxs) ps.append(p) rs.append(r) fs.append(f) print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs))) ================================================ FILE: DepressionCollected/Classification/AudioTraditionalClassifiers.py ================================================ from sklearn.model_selection import KFold import numpy as np import pandas as pd import os import pickle import random import itertools from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] audio_dep_idxs_tmp = np.where(audio_targets == 1)[0] audio_non_idxs = np.where(audio_targets == 0)[0] def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] y_test_pred = y_test_pred_proba # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix def standard_confusion_matrix(y_test, y_test_pred): [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.65_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] precs, recs, f1s = [], [], [] for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in audio_dep_idxs_tmp: feat = audio_features[idx] count = 0 resample_idxs = [0,1,2,3,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) train_idxs.append(len(audio_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in audio_dep_idxs_tmp: feat = audio_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) test_idxs.append(len(audio_features)-1) count += 1 else: test_idxs.append(idx) X_train = audio_features[train_idxs] Y_train = audio_targets[train_idxs] X_test = audio_features[test_idxs] Y_test = audio_targets[test_idxs] # Decision Tree # from sklearn import tree # clf = tree.DecisionTreeClassifier(max_depth=20) # svm # from sklearn.svm import SVC # clf = SVC(kernel='sigmoid') # rf from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=50) # lr # from sklearn.linear_model import LogisticRegression # clf = LogisticRegression(solver='newton-cg') clf.fit([f.flatten() for f in X_train], Y_train) pred = clf.predict([f.flatten() for f in X_test]) # clf.fit([f.sum(axis=0) for f in X_train], Y_train) # pred = clf.predict([f.sum(axis=0) for f in X_test]) y_test_pred, conf_matrix = model_performance(Y_test, pred) # custom evaluation metrics print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) precs.append(0 if np.isnan(precision) else precision) recs.append(0 if np.isnan(recall) else recall) f1s.append(0 if np.isnan(f1_score) else f1_score) # precs.append(precision) # recs.append(recall) # f1s.append(f1_score) print(np.mean(precs), np.mean(recs), np.mean(f1s)) ================================================ FILE: DepressionCollected/Classification/FuseModelChecking.py ================================================ from fuse_net_whole import fusion_net, config, model_performance import os import numpy as np import torch from torch.autograd import Variable import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] text_model_paths = ['BiLSTM_128_0.67_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt'] audio_model_paths = ['BiLSTM_gru_vlad256_256_0.63_1.pt', 'BiLSTM_gru_vlad256_256_0.65_2.pt', 'BiLSTM_gru_vlad256_256_0.60_3.pt'] fuse_model_paths = ['fuse_0.69_1.pt', 'fuse_0.68_2.pt', 'fuse_0.62_3.pt'] text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])] fuse_targets = text_targets fuse_dep_idxs = np.where(text_targets == 1)[0] fuse_non_idxs = np.where(text_targets == 0)[0] def evaluate(model, test_idxs): model.eval() pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) X_test = [] Y_test = [] for idx in test_idxs: X_test.append(fuse_features[idx]) Y_test.append(fuse_targets[idx]) global max_train_acc, max_acc,max_f1 for i in range(0, len(X_test), config['batch_size']): if i + config['batch_size'] > len(X_test): x, y = X_test[i:], Y_test[i:] else: x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() text_feature, audio_feature = model.pretrained_feature(x) with torch.no_grad(): # concat_x = torch.cat((audio_feature, text_feature), dim=1) audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() concat_x = torch.cat((text_feature, audio_feature), dim=1) output = model(concat_x) pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) # custom evaluation metrics print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) return precision, recall, f1_score ps, rs, fs = [], [], [] for fold in range(3): train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold])), allow_pickle=True) test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp)) resample_idxs = list(range(6)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in fuse_dep_idxs: feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 3) count = 0 for fuse_perm in zip(audio_perm, text_perm): if count in resample_idxs: fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, 1)) train_idxs.append(len(fuse_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in fuse_dep_idxs: feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 3) count = 0 resample_idxs = [0,1,4,5] for fuse_perm in zip(audio_perm, text_perm): if count in resample_idxs: fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, 1)) test_idxs.append(len(fuse_features)-1) count += 1 else: test_idxs.append(idx) fuse_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Fuse/{}'.format(fuse_model_paths[fold]))) p, r, f = evaluate(fuse_model, test_idxs) ps.append(p) rs.append(r) fs.append(f) print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs))) ================================================ FILE: DepressionCollected/Classification/TextModelChecking.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix import numpy as np import pandas as pd import wave import re import os import tensorflow.compat.v1 as tf import random import itertools from sklearn.preprocessing import StandardScaler import pickle # prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) # text_features = np.load(os.path.join(prefix, 'Features/Text/whole_samples_clf_avg.npz'))['arr_0'] # text_targets = np.load(os.path.join(prefix, 'Features/Text/whole_labels_clf_avg.npz'))['arr_0'] # audio_dep_idxs = np.where(text_targets == 1)[0] # audio_non_idxs = np.where(text_targets == 0)[0] # # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.80.npy'), allow_pickle=True) # # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.80.npy'), allow_pickle=True)) # # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.65_2.npy'), allow_pickle=True) # # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.65_2.npy'), allow_pickle=True)) # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.89_3.npy'), allow_pickle=True) # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.89_3.npy'), allow_pickle=True)) # test_dep_idxs_tmp = list(set(audio_dep_idxs) - set(train_dep_idxs_tmp)) # test_non_idxs = list(set(audio_non_idxs) - set(train_non_idxs)) prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) text_features = np.load(os.path.join( prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] text_targets = np.load(os.path.join( prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] text_dep_idxs_tmp = np.where(text_targets == 1)[0] text_non_idxs = np.where(text_targets == 0)[0] # # training data augmentation # train_dep_idxs = [] # for idx in train_dep_idxs_tmp: # feat = text_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) # text_targets = np.hstack((text_targets, 1)) # train_dep_idxs.append(len(text_features)-1) # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) # text_targets = np.hstack((text_targets, 1)) # train_dep_idxs.append(len(text_features)-1) # # test data augmentation # test_dep_idxs = [] # for idx in test_dep_idxs_tmp: # feat = text_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) # text_targets = np.hstack((text_targets, 1)) # test_dep_idxs.append(len(text_features)-1) def standard_confusion_matrix(y_test, y_test_pred): """ Make confusion matrix with format: ----------- | TP | FP | ----------- | FN | TN | ----------- Parameters ---------- y_true : ndarray - 1D y_pred : ndarray - 1D Returns ------- ndarray - 2D """ [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] y_test_pred = y_test_pred_proba # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix class TextBiLSTM(nn.Module): def __init__(self, config): super(TextBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True) ) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # 双层lstm self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional) # self.init_weight() # FC层 # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1), ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): # x : [len_seq, batch_size, embedding_dim] x = x.permute(1, 0, 2) output, (final_hidden_state, final_cell_state) = self.lstm_net(x) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) return self.fc_out(atten_out) class BiLSTM(nn.Module): def __init__(self, rnn_layers, dropout, num_classes, text_hidden_dims, text_embed_size): super(BiLSTM, self).__init__() self.text_embed_size = text_embed_size self.text_hidden_dims = text_hidden_dims self.rnn_layers = rnn_layers self.dropout = dropout self.num_classes = num_classes # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.text_hidden_dims, self.text_hidden_dims), nn.ReLU(inplace=True) ) # 双层lstm self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=True) # FC层 self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.text_hidden_dims, self.text_hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.text_hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1), ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x_text): # x : [len_seq, batch_size, embedding_dim] x_text = x_text.permute(1, 0, 2) output, (final_hidden_state, _) = self.lstm_net(x_text) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) text_feature = self.fc_out(atten_out) return text_feature def evaluate(model, test_idxs): model.eval() batch_idx = 1 total_loss = 0 pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) # X_test = text_features[test_dep_idxs+test_non_idxs] # Y_test = text_targets[test_dep_idxs+test_non_idxs] X_test = text_features[test_idxs] Y_test = text_targets[test_idxs] global max_train_acc, max_acc, max_f1 for i in range(0, X_test.shape[0], config['batch_size']): if i + config['batch_size'] > X_test.shape[0]: x, y = X_test[i:], Y_test[i:] else: x, y = X_test[i:(i+config['batch_size']) ], Y_test[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda( ), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type( torch.FloatTensor), requires_grad=True), Variable(torch.from_numpy(y)) with torch.no_grad(): output = model(x.squeeze(2)) pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) y_test_pred, conf_matrix = model_performance( Y_test, pred[config['batch_size']:]) print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / \ (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) return precision, recall, f1_score text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt'] train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join( prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] resample_idxs = [0, 1, 2, 3, 4, 5] fold = 1 ps, rs, fs = [], [], [] for idx_i, train_idxs_tmp in enumerate(train_idxs_tmps): test_idxs_tmp = list( set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack( (text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) train_idxs.append(len(text_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack( (text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) test_idxs.append(len(text_features)-1) count += 1 else: test_idxs.append(idx) config = { 'num_classes': 2, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 1024, 'batch_size': 4, 'epochs': 100, 'learning_rate': 2e-5, 'hidden_dims': 128, 'bidirectional': True, 'cuda': False, } text_lstm_model = torch.load(os.path.join( prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[idx_i]))) model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'], config['hidden_dims'], config['embedding_size']) # model_state_dict = {} # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] # model_state_dict = text_lstm_model.state_dict() # model.load_state_dict(model_state_dict) # evaluate(text_features_test, fuse_targets_test, audio_lstm_model) # evaluate(model, test_idxs) p, r, f = evaluate(text_lstm_model, test_idxs) ps.append(p) rs.append(r) fs.append(f) print('precison: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs))) ================================================ FILE: DepressionCollected/Classification/TextTraditionalClassifiers.py ================================================ from sklearn.model_selection import KFold import numpy as np import pandas as pd import os import pickle import random import itertools from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] text_dep_idxs_tmp = np.where(text_targets == 1)[0] text_non_idxs = np.where(text_targets == 0)[0] def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] y_test_pred = y_test_pred_proba # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix def standard_confusion_matrix(y_test, y_test_pred): [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.65_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] precs, recs, f1s = [], [], [] for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 resample_idxs = [0,1,2,3,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) train_idxs.append(len(text_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) test_idxs.append(len(text_features)-1) count += 1 else: test_idxs.append(idx) # train_idxs = train_idxs_tmp # test_idxs = test_idxs_tmp X_train = text_features[train_idxs] Y_train = text_targets[train_idxs] X_test = text_features[test_idxs] Y_test = text_targets[test_idxs] # Decision Tree from sklearn import tree clf = tree.DecisionTreeClassifier(max_depth=20) # svm # from sklearn.svm import SVC # clf = SVC(kernel='rbf', gamma='auto') # rf # from sklearn.ensemble import RandomForestClassifier # clf = RandomForestClassifier(n_estimators=10, max_depth=20) # lr # from sklearn.linear_model import LogisticRegression # clf = LogisticRegression() clf.fit([f.flatten() for f in X_train], Y_train) pred = clf.predict([f.flatten() for f in X_test]) # clf.fit([f.sum(axis=0) for f in X_train], Y_train) # pred = clf.predict([f.sum(axis=0) for f in X_test]) y_test_pred, conf_matrix = model_performance(Y_test, pred) # custom evaluation metrics print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) # precs.append(0 if np.isnan(precision) else precision) # recs.append(0 if np.isnan(recall) else recall) # f1s.append(0 if np.isnan(f1_score) else f1_score) precs.append(precision) recs.append(recall) f1s.append(f1_score) print(np.mean(precs), np.mean(recs), np.mean(f1s)) ================================================ FILE: DepressionCollected/Classification/audio_features_whole.py ================================================ import os import numpy as np import pandas as pd import wave import librosa from python_speech_features import * import sys import pickle sys.path.append('/Users/linlin/Desktop/depression/classfication') import tensorflow.compat.v1 as tf import vggish.vggish_input as vggish_input import vggish.vggish_params as vggish_params import vggish.vggish_postprocess as vggish_postprocess import vggish.vggish_slim as vggish_slim import loupe_keras as lpk from allennlp.commands.elmo import ElmoEmbedder tf.enable_eager_execution() elmo = ElmoEmbedder() os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) # Paths to downloaded VGGish files. checkpoint_path =os.path.join(os.getcwd(), 'vggish/vggish_model.ckpt') pca_params_path = os.path.join(os.getcwd(), 'vggish/vggish_pca_params.npz') cluster_size = 16 min_len = 100 max_len = -1 def to_vggish_embedds(x, sr): # x为输入的音频,sr为sample_rate input_batch = vggish_input.waveform_to_examples(x, sr) with tf.Graph().as_default(), tf.Session() as sess: vggish_slim.define_vggish_slim() vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path) features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME) embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME) [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: input_batch}) # Postprocess the results to produce whitened quantized embeddings. pproc = vggish_postprocess.Postprocessor(pca_params_path) postprocessed_batch = pproc.postprocess(embedding_batch) return tf.cast(postprocessed_batch, dtype='float32') def wav2vlad(wave_data, sr): global cluster_size signal = wave_data melspec = librosa.feature.melspectrogram(signal, n_mels=80,sr=sr).astype(np.float32).T melspec = np.log(np.maximum(1e-6, melspec)) feature_size = melspec.shape[1] max_samples = melspec.shape[0] output_dim = cluster_size * 16 feat = lpk.NetVLAD(feature_size=feature_size, max_samples=max_samples, \ cluster_size=cluster_size, output_dim=output_dim) \ (tf.convert_to_tensor(melspec)) with tf.Session() as sess: init = tf.global_variables_initializer() sess.run(init) r = feat.numpy() return r def extract_features(number, audio_features, targets, path): global max_len, min_len if not os.path.exists(os.path.join(prefix, '{1}/{0}/positive_out.wav'.format(number, path))): return positive_file = wave.open(os.path.join(prefix, '{1}/{0}/positive_out.wav'.format(number, path))) sr1 = positive_file.getframerate() nframes1 = positive_file.getnframes() wave_data1 = np.frombuffer(positive_file.readframes(nframes1), dtype=np.short).astype(np.float) len1 = nframes1 / sr1 neutral_file = wave.open(os.path.join(prefix, '{1}/{0}/neutral_out.wav'.format(number, path))) sr2 = neutral_file.getframerate() nframes2 = neutral_file.getnframes() wave_data2 = np.frombuffer(neutral_file.readframes(nframes2), dtype=np.short).astype(np.float) len2 = nframes2 / sr2 negative_file = wave.open(os.path.join(prefix, '{1}/{0}/negative_out.wav'.format(number, path))) sr3 = negative_file.getframerate() nframes3 = negative_file.getnframes() wave_data3 = np.frombuffer(negative_file.readframes(nframes3), dtype=np.short).astype(np.float) len3 = nframes3/sr3 for l in [len1, len2, len3]: if l > max_len: max_len = l if l < min_len: min_len = l with open(os.path.join(prefix, '{1}/{0}/new_label.txt'.format(number, path))) as fli: target = float(fli.readline()) if wave_data1.shape[0] < 1: wave_data1 = np.array([1e-4]*sr1*5) if wave_data2.shape[0] < 1: wave_data2 = np.array([1e-4]*sr2*5) if wave_data3.shape[0] < 1: wave_data3 = np.array([1e-4]*sr3*5) audio_features.append([wav2vlad(wave_data1, sr1), wav2vlad(wave_data2, sr2), \ wav2vlad(wave_data3, sr3)]) # targets.append(1 if target >= 53 else 0) targets.append(target) audio_features = [] audio_targets = [] for index in range(114): extract_features(index+1, audio_features, audio_targets, 'Data') for index in range(114): extract_features(index+1, audio_features, audio_targets, 'ValidationData') print("Saving npz file locally...") np.savez(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_%d.npz'%(cluster_size*16)), audio_features) np.savez(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_%d.npz')%(cluster_size*16), audio_targets) print(max_len, min_len) ================================================ FILE: DepressionCollected/Classification/audio_gru_whole.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split from sklearn.model_selection import KFold import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] audio_dep_idxs_tmp = np.where(audio_targets == 1)[0] audio_non_idxs = np.where(audio_targets == 0)[0] class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() # self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if not 'ln' in name: if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # self.lstm_net_audio = nn.LSTM(self.embedding_size, # self.hidden_dims, # num_layers=self.rnn_layers, # dropout=self.dropout, # bidirectional=self.bidirectional, # batch_first=True) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.ln = nn.LayerNorm(self.embedding_size) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x = self.ln(x) x, _ = self.lstm_net_audio(x) x = x.mean(dim=1) out = self.fc_audio(x) return out config = { 'num_classes': 2, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 256, 'batch_size': 8, 'epochs': 170, 'learning_rate': 6e-6, 'hidden_dims': 256, 'bidirectional': False, 'cuda': False } def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def standard_confusion_matrix(y_test, y_test_pred): """ Make confusion matrix with format: ----------- | TP | FP | ----------- | FN | TN | ----------- Parameters ---------- y_true : ndarray - 1D y_pred : ndarray - 1D Returns ------- ndarray - 2D """ [[tn, fp], [fn, tp]] = confusion_matrix(y_test.cpu().numpy(), y_test_pred) return np.array([[tp, fp], [fn, tn]]) def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred.numpy()) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix def train(epoch, train_idxs): global lr, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 pred = np.array([]) X_train = audio_features[train_idxs] Y_train = audio_targets[train_idxs] for i in range(0, X_train.shape[0], config['batch_size']): if i + config['batch_size'] > X_train.shape[0]: x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( i + config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(y)) # 将模型的参数梯度设置为0 optimizer.zero_grad() output = model(x) pred = output.data.max(1, keepdim=True)[1] #print(pred.shape, y.shape) correct += pred.eq(y.data.view_as(pred)).cpu().sum() loss = criterion(output, y) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 total_loss += loss.item() train_acc = correct print( 'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n ' .format(epoch + 1, config['learning_rate'], total_loss, correct, X_train.shape[0], 100. * correct / X_train.shape[0])) def evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs): model.eval() batch_idx = 1 total_loss = 0 global max_f1, max_acc, min_mae, X_test_lens, max_prec, max_rec pred = np.array([]) with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(audio_targets[test_idxs])).cuda() else: x, y = Variable(torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(audio_targets[test_idxs])).type(torch.LongTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y) total_loss += loss.item() y_test_pred, conf_matrix = model_performance(y, output.cpu()) accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('=' * 89) if max_f1 <= f1_score and train_acc > len(train_idxs)*0.90 and f1_score > 0.5: max_f1 = f1_score max_acc = accuracy max_rec = recall max_prec = precision mode ='gru' save(model, os.path.join(prefix, 'Model/ClassificationWhole/Audio/BiLSTM_{}_vlad{}_{}_{:.2f}_{}'.format(mode, config['embedding_size'], config['hidden_dims'], max_f1, fold))) np.save(os.path.join(prefix, 'Features/TextWhole/train_idxs_{:.2f}_{}.npy'.format(f1_score, fold)), train_idxs_tmp) print('*' * 64) print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) print('*' * 64) return total_loss def get_param_group(model): nd_list = [] param_list = [] for name, param in model.named_parameters(): if 'ln' in name: nd_list.append(param) else: param_list.append(param) return [{'params': param_list, 'weight_decay': 1e-5}, {'params': nd_list, 'weight_decay': 0}] if __name__ == '__main__': # kf = KFold(n_splits=3, shuffle=True) # fold = 1 # for train_idxs_tmp, test_idxs_tmp in kf.split(audio_features): train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): fold = idx_idx + 1 # if idx_idx != 1: # continue test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] resample_idxs = [0,1,2,3,4,5] # depression data augmentation for idx in train_idxs_tmp: if idx in audio_dep_idxs_tmp: feat = audio_features[idx] count = 0 for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) train_idxs.append(len(audio_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in audio_dep_idxs_tmp: feat = audio_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) test_idxs.append(len(audio_features)-1) count += 1 else: test_idxs.append(idx) # test_idxs.append(idx) model = AudioBiLSTM(config) if config['cuda']: model = model.cuda() param_group = get_param_group(model) optimizer = optim.AdamW(param_group, lr=config['learning_rate']) criterion = nn.CrossEntropyLoss() # criterion = FocalLoss(class_num=2) max_f1 = -1 max_acc = -1 max_rec = -1 max_prec = -1 train_acc = -1 for ep in range(1, config['epochs']): train(ep, train_idxs) tloss = evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs) fold += 1 ================================================ FILE: DepressionCollected/Classification/fuse_net_whole.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix import numpy as np import pandas as pd import wave import librosa from python_speech_features import * import re from allennlp.commands.elmo import ElmoEmbedder import os import tensorflow.compat.v1 as tf import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0'] fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])] fuse_targets = text_targets fuse_dep_idxs = np.where(text_targets == 1)[0] fuse_non_idxs = np.where(text_targets == 0)[0] def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def standard_confusion_matrix(y_test, y_test_pred): """ Make confusion matrix with format: ----------- | TP | FP | ----------- | FN | TN | ----------- Parameters ---------- y_true : ndarray - 1D y_pred : ndarray - 1D Returns ------- ndarray - 2D """ [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] y_test_pred = y_test_pred_proba # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix class TextBiLSTM(nn.Module): def __init__(self, config): super(TextBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True) ) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # 双层lstm self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional) # self.init_weight() # FC层 # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1), ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): # x : [len_seq, batch_size, embedding_dim] x = x.permute(1, 0, 2) output, (final_hidden_state, final_cell_state) = self.lstm_net(x) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) return self.fc_out(atten_out) class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() # self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if not 'ln' in name: if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # self.lstm_net_audio = nn.LSTM(self.embedding_size, # self.hidden_dims, # num_layers=self.rnn_layers, # dropout=self.dropout, # bidirectional=self.bidirectional, # batch_first=True) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.ln = nn.LayerNorm(self.embedding_size) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x = self.ln(x) x, _ = self.lstm_net_audio(x) x = x.mean(dim=1) out = self.fc_audio(x) return out class fusion_net(nn.Module): def __init__(self, text_embed_size, text_hidden_dims, rnn_layers, dropout, num_classes, \ audio_hidden_dims, audio_embed_size): super(fusion_net, self).__init__() self.text_embed_size = text_embed_size self.audio_embed_size = audio_embed_size self.text_hidden_dims = text_hidden_dims self.audio_hidden_dims = audio_hidden_dims self.rnn_layers = rnn_layers self.dropout = dropout self.num_classes = num_classes # ============================= TextBiLSTM ================================= # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.text_hidden_dims, self.text_hidden_dims), nn.ReLU(inplace=True) ) # 双层lstm self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=True) # FC层 self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.text_hidden_dims, self.text_hidden_dims), nn.ReLU(), nn.Dropout(self.dropout) ) # ============================= TextBiLSTM ================================= # ============================= AudioBiLSTM ============================= self.lstm_net_audio = nn.GRU(self.audio_embed_size, self.audio_hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=False, batch_first=True) self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims), nn.ReLU(), nn.Dropout(self.dropout) ) self.ln = nn.LayerNorm(self.audio_embed_size) # ============================= AudioBiLSTM ============================= # ============================= last fc layer ============================= # self.bn = nn.BatchNorm1d(self.text_hidden_dims + self.audio_hidden_dims) # modal attention self.modal_attn = nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.text_hidden_dims + self.audio_hidden_dims, bias=False) self.fc_final = nn.Sequential( nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.num_classes, bias=False), # nn.ReLU(), nn.Softmax(dim=1), # nn.Sigmoid() ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def pretrained_feature(self, x): with torch.no_grad(): x_text = [] x_audio = [] for ele in x: x_text.append(ele[1]) x_audio.append(ele[0]) x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False), Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False) # ============================= TextBiLSTM ================================= # x : [len_seq, batch_size, embedding_dim] x_text = x_text.permute(1, 0, 2) output, (final_hidden_state, _) = self.lstm_net(x_text) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) text_feature = self.fc_out(atten_out) # ============================= TextBiLSTM ================================= # ============================= AudioBiLSTM ============================= x_audio = self.ln(x_audio) x_audio, _ = self.lstm_net_audio(x_audio) x_audio = x_audio.sum(dim=1) audio_feature = self.fc_audio(x_audio) # ============================= AudioBiLSTM ============================= return (text_feature, audio_feature) def forward(self, x): # x = self.bn(x) # modal_weights = torch.softmax(self.modal_attn(x), dim=1) # modal_weights = self.modal_attn(x) # x = (modal_weights * x) output = self.fc_final(x) return output class MyLoss(nn.Module): def __init__(self): super(MyLoss, self).__init__() def forward(self, text_feature, audio_feature, target, model): weight = model.fc_final[0].weight # bias = model.fc_final[0].bias # print(weight, bias) pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']]) pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:]) l = nn.CrossEntropyLoss() target = torch.tensor(target) # l = nn.BCEWithLogitsLoss() # target = F.one_hot(target, num_classes=2).type(torch.FloatTensor) # print('y: {}\npred_audio: {}\npred_text: {}\n'.format(target, pred_audio.data.max(1, keepdim=True)[1], pred_text.data.max(1, keepdim=True)[1])) # return l(pred_text, target) + l(pred_audio, target) + \ # config['lambda']*torch.norm(weight[:, :config['text_hidden_dims']]) + \ # config['lambda']*torch.norm(weight[:, config['text_hidden_dims']:]) # a = F.softmax(pred_text, dim=1) + F.softmax(pred_audio, dim=1) return l(pred_text, target) + l(pred_audio, target) config = { 'num_classes': 2, 'dropout': 0.3, 'rnn_layers': 2, 'audio_embed_size': 256, 'text_embed_size': 1024, 'batch_size': 2, 'epochs': 100, 'learning_rate': 8e-6, 'audio_hidden_dims': 256, 'text_hidden_dims': 128, 'cuda': False, 'lambda': 1e-5, } model = fusion_net(config['text_embed_size'], config['text_hidden_dims'], config['rnn_layers'], \ config['dropout'], config['num_classes'], config['audio_hidden_dims'], config['audio_embed_size']) optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) # optimizer = optim.Adam(model.parameters()) # criterion = nn.CrossEntropyLoss() criterion = MyLoss() def train(epoch, train_idxs): global max_train_acc, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 X_train = [] Y_train = [] for idx in train_idxs: X_train.append(fuse_features[idx]) Y_train.append(fuse_targets[idx]) for i in range(0, len(X_train), config['batch_size']): if i + config['batch_size'] > len(X_train): x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() # 将模型的参数梯度设置为0 optimizer.zero_grad() text_feature, audio_feature = model.pretrained_feature(x) # text_feature = torch.from_numpy(ss.fit_transform(text_feature.numpy())) # audio_feature = torch.from_numpy(ss.fit_transform(audio_feature.numpy())) # concat_x = torch.cat((audio_feature, text_feature), dim=1) concat_x = torch.cat((text_feature, audio_feature), dim=1) # dot_x = text_feature.mul(audio_feature) # add_x = text_feature.add(audio_feature) output = model(concat_x) pred = output.data.max(1, keepdim=True)[1] correct += pred.eq(torch.tensor(y).data.view_as(pred)).cpu().sum() # loss = criterion(output, torch.tensor(y)) loss = criterion(text_feature, audio_feature, y, model) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 total_loss += loss.item() cur_loss = total_loss max_train_acc = correct train_acc = correct print('Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '.format( epoch, config['learning_rate'], cur_loss/len(X_train), correct, len(X_train), 100. * correct / len(X_train))) def evaluate(model, test_idxs, fold, train_idxs): model.eval() batch_idx = 1 total_loss = 0 pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) X_test = [] Y_test = [] for idx in test_idxs: X_test.append(fuse_features[idx]) Y_test.append(fuse_targets[idx]) global max_train_acc, max_acc,max_f1 for i in range(0, len(X_test), config['batch_size']): if i + config['batch_size'] > len(X_test): x, y = X_test[i:], Y_test[i:] else: x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() text_feature, audio_feature = model.pretrained_feature(x) with torch.no_grad(): # concat_x = torch.cat((audio_feature, text_feature), dim=1) audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() concat_x = torch.cat((text_feature, audio_feature), dim=1) output = model(concat_x) # loss = criterion(output, torch.tensor(y)) loss = criterion(text_feature, audio_feature, y, model) pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) total_loss += loss.item() y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) print('\nTest set: Average loss: {:.4f}'.format(total_loss/len(X_test))) # custom evaluation metrics print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) if max_f1 < f1_score and max_train_acc >= len(train_idxs)*0.9 and f1_score > 0.61: max_f1 = f1_score max_acc = accuracy save(model, os.path.join(prefix, 'Model/ClassificationWhole/Fuse/fuse_{:.2f}_{}'.format(max_f1, fold))) print('*'*64) print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) print('*'*64) return total_loss if __name__ == '__main__': idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.62_3.pt'] audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt'] for fold in range(1, 4): # if fold != 2: # continue train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold-1])), allow_pickle=True) test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp)) resample_idxs = list(range(6)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in fuse_dep_idxs: feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 3) count = 0 for fuse_perm in zip(audio_perm, text_perm): if count in resample_idxs: fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, 1)) train_idxs.append(len(fuse_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in fuse_dep_idxs: feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 3) count = 0 resample_idxs = [0,1,4,5] for fuse_perm in zip(audio_perm, text_perm): if count in resample_idxs: fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, 1)) test_idxs.append(len(fuse_features)-1) count += 1 else: test_idxs.append(idx) text_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[fold-1]))) audio_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Audio/{}'.format(audio_model_paths[fold-1]))) model_state_dict = {} model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] model_state_dict['ln.weight'] = audio_lstm_model.state_dict()['ln.weight'] model_state_dict['ln.bias'] = audio_lstm_model.state_dict()['ln.bias'] model.load_state_dict(text_lstm_model.state_dict(), strict=False) # model.load_state_dict(audio_lstm_model.state_dict(), strict=False) model.load_state_dict(model_state_dict, strict=False) for param in model.parameters(): param.requires_grad = False model.fc_final[0].weight.requires_grad = True # model.fc_final[0].bias.requires_grad = True # model.modal_attn.weight.requires_grad = True max_f1 = -1 max_acc = -1 max_train_acc = -1 for ep in range(1, config['epochs']): train(ep, train_idxs) tloss = evaluate(model, test_idxs, fold, train_idxs) ================================================ FILE: DepressionCollected/Classification/text_bilstm_whole.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] text_dep_idxs_tmp = np.where(text_targets == 1)[0] text_non_idxs = np.where(text_targets == 0)[0] class TextBiLSTM(nn.Module): def __init__(self, config): super(TextBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if 'ln' not in name: if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True) ) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # 双层lstm self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional) # FC层 # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) self.fc_out = nn.Sequential( # nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1), ) self.ln1 = nn.LayerNorm(self.embedding_size) self.ln2 = nn.LayerNorm(self.hidden_dims) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): # x : [len_seq, batch_size, embedding_dim] x = x.permute(1, 0, 2) # x = self.ln1(x) output, (final_hidden_state, _) = self.lstm_net(x) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) # atten_out = self.ln2(atten_out) return self.fc_out(atten_out) def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def standard_confusion_matrix(y_test, y_test_pred): """ Make confusion matrix with format: ----------- | TP | FP | ----------- | FN | TN | ----------- Parameters ---------- y_true : ndarray - 1D y_pred : ndarray - 1D Returns ------- ndarray - 2D """ [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. """ y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix def train(epoch, train_idxs): global lr, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 X_train = text_features[train_idxs] Y_train = text_targets[train_idxs] for i in range(0, X_train.shape[0], config['batch_size']): if i + config['batch_size'] > X_train.shape[0]: x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( i + config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(y)) # 将模型的参数梯度设置为0 optimizer.zero_grad() output = model(x) pred = output.data.max(1, keepdim=True)[1] #print(pred.shape, y.shape) correct += pred.eq(y.data.view_as(pred)).cpu().sum() loss = criterion(output, y) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 total_loss += loss.item() train_acc = correct print( 'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n ' .format(epoch + 1, config['learning_rate'], total_loss, correct, X_train.shape[0], 100. * correct / X_train.shape[0])) def evaluate(model, test_idxs, fold, train_idxs): model.eval() batch_idx = 1 total_loss = 0 global max_f1, max_acc, min_mae, X_test_lens, max_prec, max_rec pred = np.array([]) with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(text_features[test_idxs]).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(text_targets[test_idxs])).cuda() else: x, y = Variable(torch.from_numpy(text_features[test_idxs]).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(text_targets[test_idxs])).type(torch.LongTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y) total_loss += loss.item() y_test_pred, conf_matrix = model_performance(y, output.cpu()) accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('=' * 89) if max_f1 <= f1_score and train_acc > len(train_idxs)*0.9 and f1_score > 0.5: max_f1 = f1_score max_acc = accuracy max_rec = recall max_prec = precision save(model, os.path.join(prefix, 'Model/ClassificationWhole/Text/BiLSTM_{}_{:.2f}_{}'.format(config['hidden_dims'], max_f1, fold))) print('*' * 64) print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) print('*' * 64) return total_loss def get_param_group(model): nd_list = [] param_list = [] for name, param in model.named_parameters(): if 'ln' in name: nd_list.append(param) else: param_list.append(param) return [{'params': param_list, 'weight_decay': 1e-5}, {'params': nd_list, 'weight_decay': 0}] config = { 'num_classes': 2, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 1024, 'batch_size': 4, 'epochs': 150, 'learning_rate': 1e-5, 'hidden_dims': 128, 'bidirectional': True, 'cuda': False, } train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] fold = 1 for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): # if idx_idx != 2: # continue test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 resample_idxs = [0,1,2,3,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) train_idxs.append(len(text_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) test_idxs.append(len(text_features)-1) count += 1 else: test_idxs.append(idx) model = TextBiLSTM(config) param_group = get_param_group(model) optimizer = optim.AdamW(param_group, lr=config['learning_rate']) criterion = nn.CrossEntropyLoss() max_f1 = -1 max_acc = -1 max_rec = -1 max_prec = -1 train_acc = -1 for ep in range(1, config['epochs']): train(ep, train_idxs) tloss = evaluate(model, test_idxs, fold, train_idxs) fold += 1 ================================================ FILE: DepressionCollected/Classification/text_features_whole.py ================================================ import numpy as np import pandas as pd import wave import librosa import re # from allennlp.commands.elmo import ElmoEmbedder import os prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) from elmoformanylangs import Embedder import pkuseg import thulac # from pyhanlp import HanLP import jieba # seg = pkuseg.pkuseg() # thu1 = thulac.thulac(seg_only=True) elmo = Embedder('/Users/linlin/Desktop/SpeechRecognition/DepressionCode/ELMoForManyLangs/zhs.model') topics = ['positive', 'neutral', 'negative'] answers = {} text_features = [] text_targets = [] def extract_features(text_features, text_targets, path): for index in range(114): if os.path.isdir(os.path.join(prefix, path, str(index+1))): answers[index+1] = [] for topic in topics: with open(os.path.join(prefix, path, str(index+1), '%s.txt'%(topic)) ,'r') as f: lines = f.readlines()[0] # seg_text = seg.cut(lines) # seg_text = thu1.cut(lines) # seg_text_iter = HanLP.segment(lines) seg_text_iter = jieba.cut(lines, cut_all=False) answers[index+1].append([item for item in seg_text_iter]) # answers[dir].append(seg_text) with open(os.path.join(prefix, '{1}/{0}/new_label.txt'.format(index+1, path))) as fli: target = float(fli.readline()) # text_targets.append(1 if target >= 53 else 0) text_targets.append(target) text_features.append([np.array(item).mean(axis=0) for item in elmo.sents2elmo(answers[index+1])]) extract_features(text_features, text_targets, 'Data') extract_features(text_features, text_targets, 'ValidationData') print("Saving npz file locally...") np.savez(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'), text_features) np.savez(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'), text_targets) ================================================ FILE: DepressionCollected/DAICFeatureExtarction/feature_extraction.py ================================================ import os import sys sys.path.append('/Users/linlin/Desktop/DepressionCollected') from Classification.audio_features_whole import wav2vlad import numpy as np import pandas as pd import wave prefix = os.getcwd() train_split_df = pd.read_csv(os.path.join(prefix, 'DAIC/train_split_Depression_AVEC2017.csv')) test_split_df = pd.read_csv(os.path.join(prefix, 'DAIC/dev_split_Depression_AVEC2017.csv')) train_split_num = train_split_df[['Participant_ID']]['Participant_ID'].tolist() test_split_num = test_split_df[['Participant_ID']]['Participant_ID'].tolist() train_split_clabel = train_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist() test_split_clabel = test_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist() train_split_rlabel = train_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist() test_split_rlabel = test_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist() with open('./queries.txt') as f: queries = f.readlines() def identify_topics(sentence): for query in queries: query = query.strip('\n') sentence = sentence.strip('\n') if query == sentence: return True return False def extract_features(number): transcript = pd.read_csv(os.path.join(prefix, 'DAIC/{0}_P/{0}_TRANSCRIPT.csv'.format(number)), sep='\t').fillna('') wavefile = wave.open(os.path.join(prefix, 'DAIC/{0}_P/{0}_AUDIO.wav'.format(number, 'r'))) sr = wavefile.getframerate() nframes = wavefile.getnframes() wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short) response = '' start_time = 0 stop_time = 0 feats = [] signal = [] for t in transcript.itertuples(): # 问题开始 if getattr(t,'speaker') == 'Ellie' and (identify_topics(getattr(t,'value')) or 'i think i have asked everything' in getattr(t,'value')): # 初始化 response = '' if len(signal) == 0: continue feats.append(wav2vlad(signal, sr)) signal = [] elif getattr(t,'speaker') == 'Participant': if 'scrubbed_entry' in getattr(t,'value'): continue start_time = int(getattr(t,'start_time')*sr) stop_time = int(getattr(t,'stop_time')*sr) response += (' ' + getattr(t,'value')) signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float))) print(np.shape(feats)) print('{}_P feature done'.format(number)) return feats # training set audio_features_train = [] audio_ctargets_train = [] audio_rtargets_train = [] # test set audio_features_test = [] audio_ctargets_test = [] audio_rtargets_test = [] # training set for index in range(len(train_split_num)): feat = extract_features(train_split_num[index]) audio_features_train.append(feat) audio_ctargets_train.append(train_split_clabel[index]) audio_rtargets_train.append(train_split_rlabel[index]) print("Saving npz file locally...") np.savez(os.path.join(prefix, 'DAICCode/Features/train_samples_clf.npz'), audio_features_train) np.savez(os.path.join(prefix, 'DAICCode/Features/train_samples_reg.npz'), audio_features_train) np.savez(os.path.join(prefix, 'DAICCode/Features/train_labels_clf.npz'), audio_ctargets_train) np.savez(os.path.join(prefix, 'DAICCode/Features/train_labels_reg.npz'), audio_rtargets_train) # test set for index in range(len(test_split_num)): feat = extract_features(test_split_num[index]) audio_features_test.append(feat) audio_ctargets_test.append(test_split_clabel[index]) audio_rtargets_test.append(test_split_rlabel[index]) print("Saving npz file locally...") np.savez(os.path.join(prefix, 'DAICCode/Features/test_samples_clf.npz'), audio_features_test) np.savez(os.path.join(prefix, 'DAICCode/Features/test_samples_reg.npz'), audio_features_test) np.savez(os.path.join(prefix, 'DAICCode/Features/test_labels_clf.npz'), audio_ctargets_test) np.savez(os.path.join(prefix, 'DAICCode/Features/test_labels_reg.npz'), audio_rtargets_test) ================================================ FILE: DepressionCollected/DAICFeatureExtarction/queries.txt ================================================ how are you doing today where are you from originally why'd you move to l_a how do you like l_a what are some things you really like about l_a how easy was it for you to get used to living in l_a what are some things you don't really like about l_a what'd you study at school are you still doing that what's your dream job do you travel a lot why how often do you go back to your hometown do you consider yourself an introvert what do you do to relax how are you at controlling your temper when was the last time you argued with someone and what was it about how did you feel in that moment tell me more about that how close are you to them how do you know them what are some things you like to do for fun who's someone that's been a positive influence in your life can you tell me about that how close are you to your family is there anything you regret what made you decide to do that could you have done anything to avoid it what's one of your most memorable experiences what's it like for you living with them how do you like your living situation do you have roommates how easy is it for you to get a good night's sleep do you feel that way often what are you like when you don't sleep well do you feel down have you been diagnosed with depression have you ever been diagnosed with p_t_s_d have you ever served in the military when was the last time you felt really happy what do you think of today's kids can you give me an example of that what do you do when you're annoyed when was the last time that happened how would your best friend describe you where do you live how hard is that what do you do now are you happy you did that what are some things that make you really mad what do you do to relax like what are you still working in that can you give me an example of that do you feel down like what how do you cope with them have you noticed any changes in your behavior or thoughts lately do you have disturbing thoughts how easy is it for you to get a good night sleep what do you enjoy about traveling i'd love to hear about one of your trips what advice would you give yourself ten or twenty years ago what are some things you really like about l_a how are you at controlling your temper has that gotten you in trouble do you find it easy to be a parent what's the hardest thing about being a parent tell me about your kids what's one of your most memorable experiences how did you feel in that moment have you ever served in the military have you been diagnosed with depression how would you best friend describe you what'd you study at school nice are you still doing that what are some things that make you really mad could you have done anything to avoid it could you say a little more about that when was the last time you argued with someone and what was it about do you travel a lot when was the last time that happened have you ever been diagnosed with p_t_s_d how would your best friend describe you when was the last time you felt really happy how did you decide to do that okay could you have done anything to avoid it do you feel like therapy is useful did you think you had a problem before you found out how has seeing a therapist affected you what sort of changes have you noticed since you've been going to therapy why did you stop who's someone that's been a positive influence in your life when did you move to l_a how often do you go back to your home town what got you to seek help what were your symptoms yeah what do you enjoy about traveling okay what's the best thing about being a parent when was the last time you argued with someone and what was it about could you say a little more about that how long ago were you diagnosed so how are you doing today could you say a little more about that do you still go to therapy now do you feel like therapy's useful have you noticed any changes in your behavior or thoughts lately tell me about that what would you say are some of your best qualities what are some things that usually put you in a good mood what are you most proud of in your life how does it compare to l_a tell me about something you did recently that you really enjoyed is going to a therapist helping you how have you been feeling lately are they triggered by something what's the best thing about being a parent why'd you decide to enlist in the military how old were you when you joined the military how did serving in the military change you what did you do after the military when'd you move to l_a how has seeing a therapist affected you who's someone that's been a positive influence in your life what are some things you like to do for fun who's someone that's been a positive influence in your life what was it about do you think that maybe you're being a little hard on yourself so how are you doing today where are you from originally how easy was it for you to get used to living in l_a what are some things you don't really like about l_a how often to you go back to your home town why how close are you to your family do you travel a lot what do you enjoy about traveling i'd love to hear about one of your trips do you consider yourself an introvert can you give me an example of that what do you do when you're annoyed what do you do to relax what's your dream job how long ago were you diagnosed what got you to seek help do you feel like therapy's useful do you still go to therapy now what sort of changes have you noticed since you've been going to therapy how have you been feeling lately tell me more about that what would you say are some of your best qualities what are some things that usually put you in a good mood when was the last time you felt really happy who's someone that's been a positive influence in your life how do you know them how close are you to them what are you most proud of in your life are you still doing that do you consider yourself an introvert do you feel that way often how do you like your living situation do you have roommates how easy is it for you to get a good night's sleep what are you like when you don't sleep well what advice would you give yourself ten or twenty years ago how close are you to your family tell me about something you did recently that you really enjoyed what are some things that usually put you in a good mood why why what made you decide to go and see someone okay so how are you doing today why'd you move to l_a how often do you go back to your hometown how did you decide to do that is there anything you regret could you have done anything to avoid it how easy is it for you to get a good night's sleep do you find it easy to be a parent what's the best thing about being a parent what's the hardest thing about being a parent and please feel free to tell me anything you answers are totally confidential and please feel free to tell me anything you're answers are totally confidential what made you decide to do that what advice would you give yourself ten or twenty years ago what do you think of today's kids tell me about that how hard is that can you tell me about that so how are you doing today are you still working in that what are some things you like to do for fun that's good where are you from originally when was the last time you argued with someone and what was it about where do you live did you think you had a problem before you found out what were your symptoms why did you stop okay so how are you doing today what do you do now are you happy you did that are they triggered by something how do you cope with them has that gotten you in trouble what are you what are some things that make you really mad how has seeing a therapist affected you yeah how hard is that mhm what are some things you don't really like about l_a mhm how did you decide to do that how close are you to your family do you find it easy to be a parent that's good what do you think of today's kids awesome how did you decide to do that uh huh uh huh uh huh is there anything you regret is there anything you regret how old were you when you joined the military did you ever see combat how did serving in the military change you what did you do after the military how easy was it for you to go back to civilian life is going to a therapist helping you that's good where are you from originally tell me about your kids yeah how hard is that do you think that maybe you're being a little hard on yourself do you consider yourself and introvert how often do you go back to your home town how_doingV (so how are you doing today) where_originally (where are you from originally) like_about_LA (what are some things you really like about l_a) dont_like_LA (what are some things you don't really like about l_a) study (what did you study at school) still_doing_X (are you still doing that) change_directions (what made you decide to do that) happy_didthat (are you happy you did that) job_virtually (i love my job you could almost say it's virtually made for me what's your dream job) shyoutgoing (do you consider yourself more shy or outgoing) tell_about_that (can you tell me about that) relax_fishtank (sometimes when i'm feeling tense i turn on the fish tank screensaver hey i know it's not hawaii but it's the best i've got what do you do to relax) control_temper (how are you at controlling your temper) last_argument (when was the last time you argued with someone and what was it about) hard_decisionB (tell me about the hardest decision you've ever had to make) family_relationship (tell me about your relationship with your family) feelguilty (what's something you feel guilty about) give_example (can you give me an example of that) describe_felt (how did you feel in that moment) ptsd_diagnosed (have you ever been diagnosed with p_t_s_d) depression_diagnosed (have you been diagnosed with depression) easy_sleep (how easy is it for you to get a good night's sleep) feel_down (do you feel down) behavior_changes (have you noticed any changes in your behavior or thoughts lately) happy_lasttime (tell me about the last time you felt really happy) self_change (what are some things you wish you could change about yourself) symptoms_cope (how do you cope with them) regret (is there anything you regret) advice_back (what advice would you give to yourself ten or twenty years ago) Ellie17Dec2012_08 (what are you most proud of in your life) difficult (how hard is that) BF_describe (how would your best friend describe you) ideal_weekendC (tell me how you spend your ideal weekend) asked_everything (okay i think i have asked everything i need to) travel_shoes (i'm sure you can tell by my shoes i'm not much of a world explorer do you travel a lot) like_what (like what) travel_trips (i'd love to hear about one of your trips) still_working_on_X (are you still working in that) dream_job (what's your dream job) situation_handled (tell me about a situation that you wish you had handled differently) why_enlist (why'd you decide to enlist in the military) old (how old were you when you joined the military) combat (did you ever see combat) why2 (why) effectB (how did serving in the military change you) after (what did you do after the military) civilian_life (how easy was it for you to go back to civilian life) feel_lately (how have you been feeling lately) therapy_useful (do you feel like therapy is useful) why_seek_help (what got you to seek help) therapy_going (do you still go to therapy now) therapist_affect (how has seeing a therapist affected you) landed_trouble (has that gotten you in trouble) when_LA (when did you move to l_a) often_backB (how often do you go back to your hometown) compares_LA (how does it compare to l_a) why_LA (why did you move to l_a) adapted_LA (how easy was it for you to get used to living in l_a) hard_decision (how did you decide to do that) easy_parent (do you find it easy to be a parent) parent_hardest (what's the hardest thing about being a parent) parent_best (what's the best thing about being a parent) parent_differences (what are some ways that you're different as a parent than your parents) military (have you ever served in the military) too_hard (do you think that maybe you're being a little hard on yourself) Ellie17Dec2012_07 (what would you say are some of your best qualities) memorableB (what's one of your most memorable experiences) travel_changed (what do you enjoy about traveling) memory_erase (tell me about an event or something that you wish you could erase from your memory) bouts_symptoms (when was the last time that happened) argument_about (what was it about) avoid (could you have done anything to avoid it) trigger (are they triggered by something) sleep_affects (what are you like when you don't sleep well) when_diagnosed (how long ago were you diagnosed) therapy_changes (what sort of changes have you noticed since you've been going to therapy) feelbadly (tell me about a time when someone made you feel really badly about yourself) more (tell me more about that) disturbing_thoughts (do you have disturbing thoughts) Ellie17Dec2012_10 (tell me about something you did recently that you really enjoyed) Ellie17Dec2012_09 (what are some things that usually put you in a good mood) do_fun (what are some things you like to do for fun) influence_positive (who's someone that's been a positive influence in your life) how_close (how close are you to them) tell_me_about (tell me about that) suspect_problem (did you think you had a problem before you found out) symptoms_what (what were your symptoms) how_know (how do you know them) therapist_useful (is going to a therapist helping you) stop_going (why did you stop) mad_makeyou (what are some things that make you really mad) where_live (where do you live) roommates (do you have roommates) living_situation (how do you like your living situation) what_do_when_annoyed (what do you do when you are annoyed) elaborate (could you say a little more about that) family_roleB (how close are you to your family) todays_kids (what do you think of today's kids) tell_me_moreV2 (can you tell me more about that) kids_elaborate (tell me about your kids) ================================================ FILE: DepressionCollected/Regression/AudioModelChecking.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] audio_dep_idxs = np.where(audio_targets >= 53)[0] audio_non_idxs = np.where(audio_targets < 53)[0] dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) config = { 'num_classes': 1, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 256, 'batch_size': 4, 'epochs': 100, 'learning_rate': 5e-5, 'hidden_dims': 256, 'bidirectional': False, 'cuda': False } class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional, batch_first=True) # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.bn = nn.BatchNorm1d(3) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x, _ = self.lstm_net_audio(x) # x = self.bn(x) x = x.sum(dim=1) out = self.fc_audio(x) return out def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def evaluate(fold, model): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = audio_features[list(test_dep_idxs)+list(test_non_idxs)] Y_test = audio_targets[list(test_dep_idxs)+list(test_non_idxs)] with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) fold = 2 audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Regression/Audio%d/gru_vlad256_256_8.25.pt'%(fold+1))) model = AudioBiLSTM(config) # model_state_dict = {} # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] model_state_dict = audio_lstm_model.state_dict() model.load_state_dict(model_state_dict, strict=True) test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] test_non_idxs = non_idxs[fold*44:(fold+1)*44] train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) # training data augmentation train_dep_idxs = [] for (i, idx) in enumerate(train_dep_idxs_tmp): feat = audio_features[idx] if i < 14: for i in itertools.permutations(feat, feat.shape[0]): audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, audio_targets[idx])) train_dep_idxs.append(len(audio_features)-1) else: train_dep_idxs.append(idx) # test data augmentation # test_dep_idxs = [] # for idx in test_dep_idxs_tmp: # feat = audio_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) # audio_targets = np.hstack((audio_targets, audio_targets[idx])) # test_dep_idxs.append(len(audio_features)-1) test_dep_idxs = test_dep_idxs_tmp optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) criterion = nn.SmoothL1Loss() # criterion = FocalLoss(class_num=2) # evaluate(fold, model) evaluate(fold, model) ================================================ FILE: DepressionCollected/Regression/audio_bilstm_perm.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] # audio_dep_idxs = np.where(audio_targets >= 53)[0] # audio_non_idxs = np.where(audio_targets < 53)[0] # dep_orders = random.sample(range(len(audio_dep_idxs)), len(audio_dep_idxs)) # non_orders = random.sample(range(len(audio_non_idxs)), len(audio_non_idxs)) # dep_idxs = audio_dep_idxs[dep_orders] # non_idxs = audio_non_idxs[non_orders] # np.save(os.path.join(prefix, 'Features/AudioWhole/dep_idxs'), dep_idxs) # np.save(os.path.join(prefix, 'Features/AudioWhole/non_idxs'), non_idxs) dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) config = { 'num_classes': 1, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 256, 'batch_size': 2, 'epochs': 120, 'learning_rate': 1e-5, 'hidden_dims': 256, 'bidirectional': False, 'cuda': False } class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional, batch_first=True) # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.bn = nn.BatchNorm1d(3) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x, _ = self.lstm_net_audio(x) # x = self.bn(x) x = x.sum(dim=1) out = self.fc_audio(x) return out def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def train(epoch): global lr, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 pred = np.array([]) X_train = audio_features[train_dep_idxs+train_non_idxs] Y_train = audio_targets[train_dep_idxs+train_non_idxs] for i in range(0, X_train.shape[0], config['batch_size']): if i + config['batch_size'] > X_train.shape[0]: x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( i + config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(y)).type(torch.FloatTensor) # 将模型的参数梯度设置为0 optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 pred = np.hstack((pred, output.flatten().detach().numpy())) total_loss += loss.item() train_mae = mean_absolute_error(Y_train, pred) print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ np.sqrt(mean_squared_error(Y_train, pred)))) return train_mae def evaluate(fold, model, train_mae): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = audio_features[list(test_dep_idxs)+list(test_non_idxs)] Y_test = audio_targets[list(test_dep_idxs)+list(test_non_idxs)] with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) if mae <= min_mae and mae < 8.5 and train_mae < 13: min_mae = mae min_rmse = rmse mode = 'bi' if config['bidirectional'] else 'norm' mode ='gru' save(model, os.path.join(prefix, 'Model/Regression/Audio{}/{}_vlad{}_{}_{:.2f}'.format(fold+1,mode, config['embedding_size'], config['hidden_dims'], min_mae))) print('*' * 64) print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) print('*' * 64) return total_loss for fold in range(3): test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] test_non_idxs = non_idxs[fold*44:(fold+1)*44] train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) # training data augmentation train_dep_idxs = [] for (i, idx) in enumerate(train_dep_idxs_tmp): feat = audio_features[idx] if i < 14: for i in itertools.permutations(feat, feat.shape[0]): audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, audio_targets[idx])) train_dep_idxs.append(len(audio_features)-1) else: train_dep_idxs.append(idx) # test data augmentation # test_dep_idxs = [] # for idx in test_dep_idxs_tmp: # feat = audio_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) # audio_targets = np.hstack((audio_targets, audio_targets[idx])) # test_dep_idxs.append(len(audio_features)-1) test_dep_idxs = test_dep_idxs_tmp model = AudioBiLSTM(config) if config['cuda']: model = model.cuda() # optimizer = optim.Adam(model.parameters()) optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) criterion = nn.L1Loss() # criterion = FocalLoss(class_num=2) min_mae = 100 min_rmse = 100 train_mae = 100 for ep in range(1, config['epochs']): train_mae = train(ep) tloss = evaluate(fold, model, train_mae) # ============== prep ============== # X_test = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/val_samples_reg_avid256.npz'))['arr_0'], axis=2) # Y_test = np.load(os.path.join(prefix, 'Features/Audio/val_labels_reg_avid256.npz'))['arr_0'] # ============== prep ============== # ============== SVM ============== # from sklearn.svm import SVR # from sklearn.model_selection import KFold # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = SVR(kernel='linear', gamma='auto') # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # # break # print(np.mean(maes), np.mean(rmses)) # ============== SVM ============== # # ============== DT ============== # from sklearn.tree import DecisionTreeRegressor # from sklearn.model_selection import KFold # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = DecisionTreeRegressor(max_depth=100, random_state=0, criterion="mse") # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # print(np.mean(maes), np.mean(rmses)) # # ============== DT ============== # # ============== RF ============== # from sklearn.ensemble import RandomForestRegressor # from sklearn.model_selection import KFold # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = RandomForestRegressor(max_depth=100, random_state=0, criterion="mse") # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # print(np.mean(maes), np.mean(rmses)) # # ============== RF ============== # ============== ada ============== # from sklearn.ensemble import AdaBoostRegressor # from sklearn.model_selection import KFold # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = AdaBoostRegressor(n_estimators=50) # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # print(np.mean(maes), np.mean(rmses)) # ============== ada ============== ================================================ FILE: DepressionCollected/Regression/fuse_net.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import mean_absolute_error, mean_squared_error import numpy as np import pandas as pd import wave import librosa from python_speech_features import * import re from allennlp.commands.elmo import ElmoEmbedder import os import tensorflow.compat.v1 as tf import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0'] audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])] fuse_targets = text_targets fuse_dep_idxs = np.where(text_targets >= 53)[0] fuse_non_idxs = np.where(text_targets < 53)[0] dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) text_model_paths = ['Model/Regression/Text1/BiLSTM_128_7.75.pt', 'Model/Regression/Text2/BiLSTM_128_8.46.pt', 'Model/Regression/Text3/BiLSTM_128_8.01.pt'] audio_model_paths = ['Model/Regression/Audio1/gru_vlad256_256_7.60.pt', 'Model/Regression/Audio2/gru_vlad256_256_8.38.pt', 'Model/Regression/Audio3/gru_vlad256_256_8.25.pt'] config = { 'num_classes': 1, 'dropout': 0.5, 'rnn_layers': 2, 'audio_embed_size': 256, 'text_embed_size': 1024, 'batch_size': 4, 'epochs': 150, 'learning_rate': 8e-5, 'audio_hidden_dims': 256, 'text_hidden_dims': 128, 'cuda': False, 'lambda': 1e-2, } class TextBiLSTM(nn.Module): def __init__(self, config): super(TextBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True) ) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # 双层lstm self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional) # self.init_weight() # FC层 # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1), ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): # x : [len_seq, batch_size, embedding_dim] x = x.permute(1, 0, 2) output, (final_hidden_state, final_cell_state) = self.lstm_net(x) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) return self.fc_out(atten_out) class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional, batch_first=True) # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.bn = nn.BatchNorm1d(3) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x, _ = self.lstm_net_audio(x) # x = self.bn(x) x = x.sum(dim=1) out = self.fc_audio(x) return out class fusion_net(nn.Module): def __init__(self, text_embed_size, text_hidden_dims, rnn_layers, dropout, num_classes, \ audio_hidden_dims, audio_embed_size): super(fusion_net, self).__init__() self.text_embed_size = text_embed_size self.audio_embed_size = audio_embed_size self.text_hidden_dims = text_hidden_dims self.audio_hidden_dims = audio_hidden_dims self.rnn_layers = rnn_layers self.dropout = dropout self.num_classes = num_classes # ============================= TextBiLSTM ================================= # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.text_hidden_dims, self.text_hidden_dims), nn.ReLU(inplace=True) ) # 双层lstm self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=True) # FC层 self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.text_hidden_dims, self.text_hidden_dims), nn.ReLU(), nn.Dropout(self.dropout) ) # ============================= TextBiLSTM ================================= # ============================= AudioBiLSTM ============================= self.lstm_net_audio = nn.GRU(self.audio_embed_size, self.audio_hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=False, batch_first=True) self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims), nn.ReLU(), nn.Dropout(self.dropout) ) # ============================= AudioBiLSTM ============================= # ============================= last fc layer ============================= # self.bn = nn.BatchNorm1d(self.text_hidden_dims + self.audio_hidden_dims) # modal attention self.modal_attn = nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.text_hidden_dims + self.audio_hidden_dims, bias=False) self.fc_final = nn.Sequential( nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.num_classes, bias=False), nn.ReLU(), # nn.Softmax(dim=1), # nn.Sigmoid() ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def pretrained_feature(self, x): with torch.no_grad(): x_text = [] x_audio = [] for ele in x: x_text.append(ele[1]) x_audio.append(ele[0]) x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False), Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False) # ============================= TextBiLSTM ================================= # x : [len_seq, batch_size, embedding_dim] x_text = x_text.permute(1, 0, 2) output, (final_hidden_state, _) = self.lstm_net(x_text) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) text_feature = self.fc_out(atten_out) # ============================= TextBiLSTM ================================= # ============================= AudioBiLSTM ============================= x_audio, _ = self.lstm_net_audio(x_audio) x_audio = x_audio.sum(dim=1) audio_feature = self.fc_audio(x_audio) # ============================= AudioBiLSTM ============================= return (text_feature, audio_feature) def forward(self, x): # x = self.bn(x) modal_weights = torch.sigmoid(self.modal_attn(x)) # modal_weights = self.modal_attn(x) x = (modal_weights * x) output = self.fc_final(x) return output class MyLoss(nn.Module): def __init__(self): super(MyLoss, self).__init__() def forward(self, text_feature, audio_feature, target, model): weight = model.fc_final[0].weight # bias = model.fc_final[0].bias # print(weight, bias) pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']]) pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:]) # l = nn.CrossEntropyLoss() l = nn.SmoothL1Loss() target = torch.tensor(target).view_as(pred_text).float() return l(pred_text, target) + l(pred_audio, target) def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def train(model, epoch): global max_train_acc, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 pred = np.array([]) X_train = [] Y_train = [] for idx in train_dep_idxs+train_non_idxs: X_train.append(fuse_features[idx]) Y_train.append(fuse_targets[idx]) for i in range(0, len(X_train), config['batch_size']): if i + config['batch_size'] > len(X_train): x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])] # 将模型的参数梯度设置为0 optimizer.zero_grad() text_feature, audio_feature = model.pretrained_feature(x) audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() # concat_x = torch.cat((text_feature_norm, audio_feature_norm), dim=1) concat_x = torch.cat((text_feature, audio_feature), dim=1) output = model(concat_x) # loss = criterion(output, torch.tensor(y).float()) loss = criterion(text_feature, audio_feature, y, model) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 pred = np.hstack((pred, output.flatten().detach().numpy())) total_loss += loss.item() train_mae = mean_absolute_error(Y_train, pred) print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ np.sqrt(mean_squared_error(Y_train, pred)))) return train_mae def evaluate(model, fold, train_mae): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = [] Y_test = [] for idx in list(test_dep_idxs)+list(test_non_idxs): X_test.append(fuse_features[idx]) Y_test.append(fuse_targets[idx]) for i in range(0, len(X_test), config['batch_size']): if i + config['batch_size'] > len(X_test): x, y = X_test[i:], Y_test[i:] else: x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] text_feature, audio_feature = model.pretrained_feature(x) with torch.no_grad(): audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() concat_x = torch.cat((text_feature, audio_feature), dim=1) # concat_x = torch.cat((text_feature_norm, audio_feature_norm), dim=1) output = model(concat_x) # loss = criterion(output, torch.tensor(y).float()) loss = criterion(text_feature, audio_feature, y, model) pred = np.hstack((pred, output.flatten().detach().numpy())) total_loss += loss.item() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) if mae <= min_mae and mae < 8.2 and train_mae < 13: min_mae = mae min_rmse = rmse save(model, os.path.join(prefix, 'Model/Regression/Fuse{}/fuse_{:.2f}'.format(fold+1, min_mae))) print('*' * 64) print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) print('*' * 64) return total_loss def evaluate_audio(model): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = [] Y_test = [] for idx in list(test_dep_idxs)+list(test_non_idxs): X_test.append(fuse_features[idx][0]) Y_test.append(fuse_targets[idx]) X_test = np.array(X_test) Y_test = np.array(Y_test) with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) def evaluate_text(model): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = [] Y_test = [] for idx in list(test_dep_idxs)+list(test_non_idxs): X_test.append(fuse_features[idx][1]) Y_test.append(fuse_targets[idx]) X_test = np.array(X_test) Y_test = np.array(Y_test) criterion = nn.SmoothL1Loss() with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) for fold in range(3): test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] test_non_idxs = non_idxs[fold*44:(fold+1)*44] train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) train_dep_idxs = [] test_dep_idxs = [] # depression data augmentation for (i, idx) in enumerate(train_dep_idxs_tmp): feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 3) if i < 14: for fuse_perm in zip(audio_perm, text_perm): fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, fuse_targets[idx])) train_dep_idxs.append(len(fuse_features)-1) else: train_dep_idxs.append(idx) test_dep_idxs = test_dep_idxs_tmp model = fusion_net(config['text_embed_size'], config['text_hidden_dims'], config['rnn_layers'], \ config['dropout'], config['num_classes'], config['audio_hidden_dims'], config['audio_embed_size']) optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) # optimizer = optim.Adam(model.parameters()) # criterion = nn.SmoothL1Loss() criterion = MyLoss() text_lstm_model = torch.load(os.path.join(prefix, text_model_paths[fold])) audio_lstm_model = torch.load(os.path.join(prefix, audio_model_paths[fold])) model_state_dict = {} model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] model.load_state_dict(text_lstm_model.state_dict(), strict=False) # model.load_state_dict(audio_lstm_model.state_dict(), strict=False) model.load_state_dict(model_state_dict, strict=False) for param in model.parameters(): param.requires_grad = True model.fc_final[0].weight.requires_grad = True # model.fc_final[0].bias.requires_grad = True model.modal_attn.weight.requires_grad = True min_mae = 100 min_rmse = 100 train_mae = 100 for ep in range(1, config['epochs']): train_mae = train(model, ep) tloss = evaluate(model, fold, train_mae) # evaluate_audio(audio_lstm_model) # evaluate_text(text_lstm_model) ================================================ FILE: DepressionCollected/Regression/text_bilstm_perm.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "../")) text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0'] dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) config = { 'num_classes': 1, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 1024, 'batch_size': 2, 'epochs': 110, 'learning_rate': 1e-5, 'hidden_dims': 128, 'bidirectional': True, 'cuda': False, } class TextBiLSTM(nn.Module): def __init__(self, config): super(TextBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True) ) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # 双层lstm self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional) # self.init_weight() # FC层 # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) self.fc_out = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1), ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): # x : [len_seq, batch_size, embedding_dim] x = x.permute(1, 0, 2) output, (final_hidden_state, final_cell_state) = self.lstm_net(x) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) return self.fc_out(atten_out) def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def train(epoch): global lr, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 pred = np.array([]) X_train = text_features[train_dep_idxs+train_non_idxs] Y_train = text_targets[train_dep_idxs+train_non_idxs] for i in range(0, X_train.shape[0], config['batch_size']): if i + config['batch_size'] > X_train.shape[0]: x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( i + config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(y)).type(torch.FloatTensor) # 将模型的参数梯度设置为0 optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 pred = np.hstack((pred, output.flatten().detach().numpy())) total_loss += loss.item() train_mae = mean_absolute_error(Y_train, pred) print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ np.sqrt(mean_squared_error(Y_train, pred)))) return train_mae def evaluate(fold, model, train_mae): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = text_features[list(test_dep_idxs)+list(test_non_idxs)] Y_test = text_targets[list(test_dep_idxs)+list(test_non_idxs)] with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) if mae <= min_mae and mae < 8.5 and train_mae < 13: min_mae = mae min_rmse = rmse mode = 'bi' if config['bidirectional'] else 'norm' mode ='gru' save(model, os.path.join(prefix, 'Model/Regression/Text{}/BiLSTM_{}_{:.2f}'.format(fold+1, config['hidden_dims'], min_mae))) print('*' * 64) print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) print('*' * 64) return total_loss for fold in range(3): test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] test_non_idxs = non_idxs[fold*44:(fold+1)*44] train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) # training data augmentation train_dep_idxs = [] for (i, idx) in enumerate(train_dep_idxs_tmp): feat = text_features[idx] if i < 14: for i in itertools.permutations(feat, feat.shape[0]): text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, text_targets[idx])) train_dep_idxs.append(len(text_features)-1) else: train_dep_idxs.append(idx) # test data augmentation # test_dep_idxs = [] # for idx in test_dep_idxs_tmp: # feat = text_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) # text_targets = np.hstack((text_targets, text_targets[idx])) # test_dep_idxs.append(len(text_features)-1) test_dep_idxs = test_dep_idxs_tmp model = TextBiLSTM(config) if config['cuda']: model = model.cuda() optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) criterion = nn.SmoothL1Loss() # criterion = FocalLoss(class_num=2) min_mae = 100 min_rmse = 100 train_mae = 100 for ep in range(1, config['epochs']): train_mae = train(ep) tloss = evaluate(fold, model, train_mae) # ============== prep ============== # X_test = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/val_samples_reg_avid256.npz'))['arr_0'], axis=2) # Y_test = np.load(os.path.join(prefix, 'Features/Audio/val_labels_reg_avid256.npz'))['arr_0'] # ============== prep ============== # ============== SVM ============== # from sklearn.svm import SVR # from sklearn.model_selection import KFold # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = SVR(kernel='linear', gamma='auto') # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # # break # print(np.mean(maes), np.mean(rmses)) # ============== SVM ============== # # ============== DT ============== # from sklearn.tree import DecisionTreeRegressor # from sklearn.model_selection import KFold # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = DecisionTreeRegressor(max_depth=100, random_state=0, criterion="mse") # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # print(np.mean(maes), np.mean(rmses)) # # ============== DT ============== # # ============== RF ============== # from sklearn.ensemble import RandomForestRegressor # from sklearn.model_selection import KFold # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = RandomForestRegressor(max_depth=100, random_state=0, criterion="mse") # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # print(np.mean(maes), np.mean(rmses)) # # ============== RF ============== # ============== ada ============== # from sklearn.ensemble import AdaBoostRegressor # from sklearn.model_selection import KFold # X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = AdaBoostRegressor(n_estimators=50) # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # print(np.mean(maes), np.mean(rmses)) # ============== ada ============== ================================================ FILE: README.md ================================================ # ICASSP2022-Depression Automatic Depression Detection: a GRU/ BiLSTM-based Model and An Emotional Audio-Textual Corpus https://arxiv.org/pdf/2202.08210.pdf https://ieeexplore.ieee.org/abstract/document/9746569/ ## Code - Regression - audio_bilstm_perm.py: train audio network - text_bilstm_perm.py: train text network - fuse_net.py: train multi-modal network - Classification - audio_features_whole.py: extract audio features - text_features_whole.py: extract text features - audio_gru_whole.py: train audio network - text_bilstm_whole.py: train text network - fuse_net_whole.py: train fuse network ## Dataset: EATD-Corpus The EATD-Corpus is a dataset consist of audio and text files of 162 volunteers who received counseling. ### How to download The EATD-Corpus can be downloaded at https://1drv.ms/u/s!AsGVGqImbOwYhHUHcodFC3xmKZKK?e=mCT5oN. Password: Ymj26Uv5 ### How to use Training set contains data from 83 volunteers (19 depressed and 64 non-depressed). Validation set contains data from 79 volunteers (11 depressed and 68 non-depressed). Each folder contains depression data for one volunteer. - {positive/negative/neutral}.wav: Raw audio in wav - {positive/negative/neutral}_out.wav: Preprocessed audio. Preprocessing operations include denoising and de-muting - {positive/negative/neutral}.txt: Audio translation - label.txt: Raw SDS score - new_label.txt: Standard SDS score (Raw SDS score multiplied by 1.25)