Repository: speechandlanguageprocessing/ICASSP2022-Depression
Branch: main
Commit: eded8cc0818d
Files: 17
Total size: 174.0 KB

Directory structure:
gitextract_kdhj1m2d/
├── DepressionCollected/
│   ├── Classification/
│   │   ├── AudioModelChecking.py
│   │   ├── AudioTraditionalClassifiers.py
│   │   ├── FuseModelChecking.py
│   │   ├── TextModelChecking.py
│   │   ├── TextTraditionalClassifiers.py
│   │   ├── audio_features_whole.py
│   │   ├── audio_gru_whole.py
│   │   ├── fuse_net_whole.py
│   │   ├── text_bilstm_whole.py
│   │   └── text_features_whole.py
│   ├── DAICFeatureExtarction/
│   │   ├── feature_extraction.py
│   │   └── queries.txt
│   └── Regression/
│       ├── AudioModelChecking.py
│       ├── audio_bilstm_perm.py
│       ├── fuse_net.py
│       └── text_bilstm_perm.py
└── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: DepressionCollected/Classification/AudioModelChecking.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import re
import os
import tensorflow.compat.v1 as tf
import random
import itertools
from audio_gru_whole import AudioBiLSTM
from sklearn.preprocessing import StandardScaler
import pickle

class BiLSTM(nn.Module):
    def __init__(self, rnn_layers, dropout, num_classes, audio_hidden_dims, audio_embed_size):
        super(BiLSTM, self).__init__()
        self.lstm_net_audio = nn.GRU(audio_embed_size, audio_hidden_dims,
                                     num_layers=rnn_layers, dropout=dropout, batch_first=True)
        self.fc_audio = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(audio_hidden_dims, audio_hidden_dims),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(audio_hidden_dims, num_classes),
            # nn.ReLU(),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        x, _ = self.lstm_net_audio(x)
        # x = self.bn(x)
        x = x.sum(dim=1)
        out = self.fc_audio(x)
        return out

# prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
# audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/whole_samples_clf_avid256.npz'))['arr_0'], axis=2)
# audio_targets = np.load(os.path.join(prefix, 'Features/Audio/whole_labels_clf_avid256.npz'))['arr_0']
prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2)
audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0']
audio_dep_idxs = np.where(audio_targets == 1)[0]
audio_non_idxs = np.where(audio_targets == 0)[0]

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : ndarray - 1D
    y_test_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
    # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix
config = {
    'num_classes': 2,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 256,
    'batch_size': 4,
    'epochs': 100,
    'learning_rate': 1e-5,
    'hidden_dims': 256,
    'bidirectional': False,
    'cuda': False
}

# audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio/BiLSTM_gru_vlad256_256_0.80.pt'))
# audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio3/BiLSTM_gru_vlad256_256_0.89.pt'))
# audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Classification/Audio2/BiLSTM_gru_vlad256_256_0.65.pt'))
# model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'], \
#     config['hidden_dims'], config['embedding_size'])
# model_state_dict = {}
# model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0']
# model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0']
# model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0']
# model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0']
# model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1']
# model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1']
# model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1']
# model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1']
# model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight']
# model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias']
# model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight']
# model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias']
# model_state_dict = audio_lstm_model.state_dict()
# model.load_state_dict(model_state_dict, strict=False)

def evaluate(model, test_idxs):
    model.eval()
    batch_idx = 1
    total_loss = 0
    pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor)
    # X_test = audio_features[test_dep_idxs+test_non_idxs]
    # Y_test = audio_targets[test_dep_idxs+test_non_idxs]
    X_test = audio_features[test_idxs]
    Y_test = audio_targets[test_idxs]
    global max_train_acc, max_acc, max_f1
    for i in range(0, X_test.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_test.shape[0]:
            x, y = X_test[i:], Y_test[i:]
        else:
            x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        else:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), Variable(torch.from_numpy(y))
        with torch.no_grad():
            output = model(x.squeeze(2))
        pred = torch.cat((pred, output.data.max(1, keepdim=True)[1]))
    y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:])
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    return precision, recall, f1_score

# evaluate(audio_features_test, fuse_targets_test, audio_lstm_model)
# evaluate(model)
idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy']
audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt']
ps, rs, fs = [], [], []
for fold in range(3):
    train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold])), allow_pickle=True)
    test_idxs_tmp = list(set(list(audio_dep_idxs)+list(audio_non_idxs)) - set(train_idxs_tmp))
    audio_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Audio/{}'.format(audio_model_paths[fold])))
    train_idxs, test_idxs = [], []
    for idx in train_idxs_tmp:
        if idx in audio_dep_idxs:
            feat = audio_features[idx]
            count = 0
            resample_idxs = [0,1,2,3,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    train_idxs.append(len(audio_features)-1)
                count += 1
        else:
            train_idxs.append(idx)
    for idx in test_idxs_tmp:
        if idx in audio_dep_idxs:
            feat = audio_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                    audio_targets = np.hstack((audio_targets, 1))
                    test_idxs.append(len(audio_features)-1)
                count += 1
        else:
            test_idxs.append(idx)
    p, r, f = evaluate(audio_lstm_model, test_idxs)
    ps.append(p)
    rs.append(r)
    fs.append(f)
print('precision: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs)))
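The augmentation loop above treats each depressed sample as three segment-level embeddings and enumerates itertools.permutations over them: 3! = 6 orderings, of which training keeps indices [0..5] and testing keeps [0, 1, 4, 5]. A small sketch with a toy array showing what one subject expands into:

import itertools
import numpy as np

feat = np.arange(6).reshape(3, 2)   # toy (3 segments, 2-dim) feature, illustration only
perms = list(itertools.permutations(feat, feat.shape[0]))
print(len(perms))                   # 3! = 6 segment orderings
augmented = np.stack([np.stack(p) for p in perms])
print(augmented.shape)              # (6, 3, 2): six resampled copies of one subject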
""" # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] y_test_pred = y_test_pred_proba # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix def standard_confusion_matrix(y_test, y_test_pred): [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.65_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] precs, recs, f1s = [], [], [] for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in audio_dep_idxs_tmp: feat = audio_features[idx] count = 0 resample_idxs = [0,1,2,3,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) train_idxs.append(len(audio_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in audio_dep_idxs_tmp: feat = audio_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, 1)) test_idxs.append(len(audio_features)-1) count += 1 else: test_idxs.append(idx) X_train = audio_features[train_idxs] Y_train = audio_targets[train_idxs] X_test = audio_features[test_idxs] Y_test = audio_targets[test_idxs] # Decision Tree # from sklearn import tree # clf = tree.DecisionTreeClassifier(max_depth=20) # svm # from sklearn.svm import SVC # clf = SVC(kernel='sigmoid') # rf from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=50) # lr # from sklearn.linear_model import LogisticRegression # clf = LogisticRegression(solver='newton-cg') clf.fit([f.flatten() for f in X_train], Y_train) pred = clf.predict([f.flatten() for f in X_test]) # clf.fit([f.sum(axis=0) for f in X_train], Y_train) # pred = clf.predict([f.sum(axis=0) for f in X_test]) y_test_pred, conf_matrix = model_performance(Y_test, pred) # custom evaluation metrics print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) precs.append(0 if np.isnan(precision) else precision) recs.append(0 if np.isnan(recall) else recall) f1s.append(0 if np.isnan(f1_score) else f1_score) # precs.append(precision) # recs.append(recall) # f1s.append(f1_score) print(np.mean(precs), np.mean(recs), np.mean(f1s)) ================================================ FILE: DepressionCollected/Classification/FuseModelChecking.py 
================================================
from fuse_net_whole import fusion_net, config, model_performance
import os
import numpy as np
import torch
from torch.autograd import Variable
import itertools

prefix = os.path.abspath(os.path.join(os.getcwd(), "./"))
idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy']
text_model_paths = ['BiLSTM_128_0.67_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt']
audio_model_paths = ['BiLSTM_gru_vlad256_256_0.63_1.pt', 'BiLSTM_gru_vlad256_256_0.65_2.pt', 'BiLSTM_gru_vlad256_256_0.60_3.pt']
fuse_model_paths = ['fuse_0.69_1.pt', 'fuse_0.68_2.pt', 'fuse_0.62_3.pt']

text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0']
text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0']
audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2)
audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0']
fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])]
fuse_targets = text_targets
fuse_dep_idxs = np.where(text_targets == 1)[0]
fuse_non_idxs = np.where(text_targets == 0)[0]

def evaluate(model, test_idxs):
    model.eval()
    pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor)
    X_test = []
    Y_test = []
    for idx in test_idxs:
        X_test.append(fuse_features[idx])
        Y_test.append(fuse_targets[idx])
    global max_train_acc, max_acc, max_f1
    for i in range(0, len(X_test), config['batch_size']):
        if i + config['batch_size'] > len(X_test):
            x, y = X_test[i:], Y_test[i:]
        else:
            x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        text_feature, audio_feature = model.pretrained_feature(x)
        with torch.no_grad():
            # concat_x = torch.cat((audio_feature, text_feature), dim=1)
            # note: the normalized features below are computed but never used;
            # the unnormalized features are what actually get concatenated
            audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std()
            text_feature_norm = (text_feature - text_feature.mean())/text_feature.std()
            concat_x = torch.cat((text_feature, audio_feature), dim=1)
            output = model(concat_x)
        pred = torch.cat((pred, output.data.max(1, keepdim=True)[1]))
    y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:])

    # custom evaluation metrics
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    return precision, recall, f1_score

ps, rs, fs = [], [], []
for fold in range(3):
    train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold])), allow_pickle=True)
    test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp))
    resample_idxs = list(range(6))
    train_idxs, test_idxs = [], []
    # depression data augmentation
    for idx in train_idxs_tmp:
        if idx in fuse_dep_idxs:
            feat = fuse_features[idx]
            audio_perm = itertools.permutations(feat[0], 3)
            text_perm = itertools.permutations(feat[1], 3)
            count = 0
            for fuse_perm in zip(audio_perm, text_perm):
                if count in resample_idxs:
                    fuse_features.append(fuse_perm)
                    fuse_targets = np.hstack((fuse_targets, 1))
                    train_idxs.append(len(fuse_features)-1)
                count += 1
        else:
            train_idxs.append(idx)
    for idx in test_idxs_tmp:
        if idx in fuse_dep_idxs:
            feat = fuse_features[idx]
            audio_perm = itertools.permutations(feat[0], 3)
            text_perm = itertools.permutations(feat[1], 3)
            count = 0
            resample_idxs = [0,1,4,5]
            for fuse_perm in zip(audio_perm, text_perm):
                if count in resample_idxs:
                    fuse_features.append(fuse_perm)
                    fuse_targets = np.hstack((fuse_targets, 1))
                    test_idxs.append(len(fuse_features)-1)
                count += 1
        else:
            test_idxs.append(idx)
    fuse_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Fuse/{}'.format(fuse_model_paths[fold])))
    p, r, f = evaluate(fuse_model, test_idxs)
    ps.append(p)
    rs.append(r)
    fs.append(f)
print('precision: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs)))
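In the fusion augmentation above, zip pairs the k-th audio permutation with the k-th text permutation, so both modalities of a resampled subject keep the same segment order (itertools.permutations enumerates index orderings identically regardless of the values being permuted). A toy sketch of that lockstep pairing:

import itertools
import numpy as np

audio = np.arange(6).reshape(3, 2)      # toy 3-segment audio features, illustration only
text = np.arange(6, 12).reshape(3, 2)   # toy 3-segment text features, illustration only

# each zip step yields (audio_perm_k, text_perm_k) under the same segment ordering
for audio_p, text_p in zip(itertools.permutations(audio, 3), itertools.permutations(text, 3)):
    print([a[0] for a in audio_p], [t[0] for t in text_p])   # matching orderings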
================================================
FILE: DepressionCollected/Classification/TextModelChecking.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import re
import os
import tensorflow.compat.v1 as tf
import random
import itertools
from sklearn.preprocessing import StandardScaler
import pickle

# prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
# text_features = np.load(os.path.join(prefix, 'Features/Text/whole_samples_clf_avg.npz'))['arr_0']
# text_targets = np.load(os.path.join(prefix, 'Features/Text/whole_labels_clf_avg.npz'))['arr_0']
# audio_dep_idxs = np.where(text_targets == 1)[0]
# audio_non_idxs = np.where(text_targets == 0)[0]
# # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.80.npy'), allow_pickle=True)
# # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.80.npy'), allow_pickle=True))
# # train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.65_2.npy'), allow_pickle=True)
# # train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.65_2.npy'), allow_pickle=True))
# train_dep_idxs_tmp = np.load(os.path.join(prefix, 'Features/Text/train_dep_idxs_0.89_3.npy'), allow_pickle=True)
# train_non_idxs = list(np.load(os.path.join(prefix, 'Features/Text/train_non_idxs_0.89_3.npy'), allow_pickle=True))
# test_dep_idxs_tmp = list(set(audio_dep_idxs) - set(train_dep_idxs_tmp))
# test_non_idxs = list(set(audio_non_idxs) - set(train_non_idxs))

prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
text_features = np.load(os.path.join(
    prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0']
text_targets = np.load(os.path.join(
    prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0']
text_dep_idxs_tmp = np.where(text_targets == 1)[0]
text_non_idxs = np.where(text_targets == 0)[0]

# # training data augmentation
# train_dep_idxs = []
# for idx in train_dep_idxs_tmp:
#     feat = text_features[idx]
#     for i in itertools.permutations(feat, feat.shape[0]):
#         text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
#         text_targets = np.hstack((text_targets, 1))
#         train_dep_idxs.append(len(text_features)-1)
#         text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
#         text_targets = np.hstack((text_targets, 1))
#         train_dep_idxs.append(len(text_features)-1)

# # test data augmentation
# test_dep_idxs = []
# for idx in test_dep_idxs_tmp:
#     feat = text_features[idx]
#     for i in itertools.permutations(feat, feat.shape[0]):
#         text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
#         text_targets = np.hstack((text_targets, 1))
#         test_dep_idxs.append(len(text_features)-1)

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : ndarray - 1D
    y_test_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
    # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

class TextBiLSTM(nn.Module):
    def __init__(self, config):
        super(TextBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()
        self.init_weight()

    def init_weight(net):
        for name, param in net.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # two-layer LSTM
        self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=self.bidirectional)
        # self.init_weight()

        # FC layer
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            # nn.ReLU(),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out: [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        # x : [len_seq, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(x)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        return self.fc_out(atten_out)

class BiLSTM(nn.Module):
    def __init__(self, rnn_layers, dropout, num_classes, text_hidden_dims, text_embed_size):
        super(BiLSTM, self).__init__()
        self.text_embed_size = text_embed_size
        self.text_hidden_dims = text_hidden_dims
        self.rnn_layers = rnn_layers
        self.dropout = dropout
        self.num_classes = num_classes

        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.text_hidden_dims, self.text_hidden_dims),
            nn.ReLU(inplace=True)
        )
        # two-layer LSTM
        self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=True)
        # FC layer
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.text_hidden_dims, self.text_hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.text_hidden_dims, self.num_classes),
            # nn.ReLU(),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out: [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x_text):
        # x : [len_seq, batch_size, embedding_dim]
        x_text = x_text.permute(1, 0, 2)
        output, (final_hidden_state, _) = self.lstm_net(x_text)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        text_feature = self.fc_out(atten_out)
        return text_feature
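attention_net_with_w, shared by all the recurrent models here, folds the two LSTM directions together, scores each time step against a projection of the summed final hidden state, and pools with the softmax weights. A shape walkthrough with random tensors (toy sizes, illustration only):

import torch
import torch.nn as nn
import torch.nn.functional as F

batch, time_step, hidden = 4, 3, 128                   # toy sizes
lstm_out = torch.randn(batch, time_step, hidden * 2)   # bidirectional output
lstm_hidden = torch.randn(batch, 4, hidden)            # num_layers * num_directions = 4

h = sum(torch.chunk(lstm_out, 2, -1))                  # fold directions: (4, 3, 128)
atten_w = nn.Linear(hidden, hidden)(lstm_hidden.sum(dim=1).unsqueeze(1))        # (4, 1, 128)
scores = F.softmax(torch.bmm(atten_w, torch.tanh(h).transpose(1, 2)), dim=-1)   # (4, 1, 3)
context = torch.bmm(scores, h).squeeze(1)              # (4, 128): one vector per sample
print(context.shape)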
def evaluate(model, test_idxs):
    model.eval()
    batch_idx = 1
    total_loss = 0
    pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor)
    # X_test = text_features[test_dep_idxs+test_non_idxs]
    # Y_test = text_targets[test_dep_idxs+test_non_idxs]
    X_test = text_features[test_idxs]
    Y_test = text_targets[test_idxs]
    global max_train_acc, max_acc, max_f1
    for i in range(0, X_test.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_test.shape[0]:
            x, y = X_test[i:], Y_test[i:]
        else:
            x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        else:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), Variable(torch.from_numpy(y))
        with torch.no_grad():
            output = model(x.squeeze(2))
        pred = torch.cat((pred, output.data.max(1, keepdim=True)[1]))
    y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:])
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    return precision, recall, f1_score

text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.66_3.pt']
train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True),
                   np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True),
                   np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)]
resample_idxs = [0, 1, 2, 3, 4, 5]
fold = 1
ps, rs, fs = [], [], []
for idx_i, train_idxs_tmp in enumerate(train_idxs_tmps):
    test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []
    # depression data augmentation
    for idx in train_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            count = 0
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    train_idxs.append(len(text_features)-1)
                count += 1
        else:
            train_idxs.append(idx)
    for idx in test_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    test_idxs.append(len(text_features)-1)
                count += 1
        else:
            test_idxs.append(idx)

    config = {
        'num_classes': 2,
        'dropout': 0.5,
        'rnn_layers': 2,
        'embedding_size': 1024,
        'batch_size': 4,
        'epochs': 100,
        'learning_rate': 2e-5,
        'hidden_dims': 128,
        'bidirectional': True,
        'cuda': False,
    }
    text_lstm_model = torch.load(os.path.join(
        prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[idx_i])))
    model = BiLSTM(config['rnn_layers'], config['dropout'], config['num_classes'],
                   config['hidden_dims'], config['embedding_size'])
    # model_state_dict = {}
    # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0']
    # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0']
    # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0']
    # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0']
    # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1']
    # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1']
    # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1']
    # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1']
    # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight']
    # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias']
    # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight']
    # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias']
    # model_state_dict = text_lstm_model.state_dict()
    # model.load_state_dict(model_state_dict)
    # evaluate(text_features_test, fuse_targets_test, audio_lstm_model)
    # evaluate(model, test_idxs)
    p, r, f = evaluate(text_lstm_model, test_idxs)
    ps.append(p)
    rs.append(r)
    fs.append(f)
print('precision: {} \n recall: {} \n f1 score: {}'.format(np.mean(ps), np.mean(rs), np.mean(fs)))
================================================
FILE: DepressionCollected/Classification/TextTraditionalClassifiers.py
================================================
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0']
text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0']
text_dep_idxs_tmp = np.where(text_targets == 1)[0]
text_non_idxs = np.where(text_targets == 0)[0]

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
    # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

def standard_confusion_matrix(y_test, y_test_pred):
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True),
                   np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.65_2.npy'), allow_pickle=True),
                   np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)]
precs, recs, f1s = [], [], []
for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps):
    test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp))
    train_idxs, test_idxs = [], []
    # depression data augmentation
    for idx in train_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            count = 0
            resample_idxs = [0,1,2,3,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    train_idxs.append(len(text_features)-1)
                count += 1
        else:
            train_idxs.append(idx)
    for idx in test_idxs_tmp:
        if idx in text_dep_idxs_tmp:
            feat = text_features[idx]
            count = 0
            # resample_idxs = random.sample(range(6), 4)
            resample_idxs = [0,1,4,5]
            for i in itertools.permutations(feat, feat.shape[0]):
                if count in resample_idxs:
                    text_features = np.vstack((text_features, np.expand_dims(list(i), 0)))
                    text_targets = np.hstack((text_targets, 1))
                    test_idxs.append(len(text_features)-1)
                count += 1
        else:
            test_idxs.append(idx)
    # train_idxs = train_idxs_tmp
    # test_idxs = test_idxs_tmp

    X_train = text_features[train_idxs]
    Y_train = text_targets[train_idxs]
    X_test = text_features[test_idxs]
    Y_test = text_targets[test_idxs]

    # Decision Tree
    from sklearn import tree
    clf = tree.DecisionTreeClassifier(max_depth=20)
    # svm
    # from sklearn.svm import SVC
    # clf = SVC(kernel='rbf', gamma='auto')
    # rf
    # from sklearn.ensemble import RandomForestClassifier
    # clf = RandomForestClassifier(n_estimators=10, max_depth=20)
    # lr
    # from sklearn.linear_model import LogisticRegression
    # clf = LogisticRegression()
    clf.fit([f.flatten() for f in X_train], Y_train)
    pred = clf.predict([f.flatten() for f in X_test])
    # clf.fit([f.sum(axis=0) for f in X_train], Y_train)
    # pred = clf.predict([f.sum(axis=0) for f in X_test])
    y_test_pred, conf_matrix = model_performance(Y_test, pred)

    # custom evaluation metrics
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    # precs.append(0 if np.isnan(precision) else precision)
    # recs.append(0 if np.isnan(recall) else recall)
    # f1s.append(0 if np.isnan(f1_score) else f1_score)
    precs.append(precision)
    recs.append(recall)
    f1s.append(f1_score)
print(np.mean(precs), np.mean(recs), np.mean(f1s))

================================================
FILE: DepressionCollected/Classification/audio_features_whole.py
================================================
import os
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import sys
import pickle
sys.path.append('/Users/linlin/Desktop/depression/classfication')
import tensorflow.compat.v1 as tf
import vggish.vggish_input as vggish_input
import vggish.vggish_params as vggish_params
import vggish.vggish_postprocess as vggish_postprocess
import vggish.vggish_slim as vggish_slim
import loupe_keras as lpk
from allennlp.commands.elmo import ElmoEmbedder

tf.enable_eager_execution()
elmo = ElmoEmbedder()
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
prefix = os.path.abspath(os.path.join(os.getcwd(), "."))

# Paths to downloaded VGGish files.
checkpoint_path = os.path.join(os.getcwd(), 'vggish/vggish_model.ckpt')
pca_params_path = os.path.join(os.getcwd(), 'vggish/vggish_pca_params.npz')
cluster_size = 16
min_len = 100
max_len = -1

def to_vggish_embedds(x, sr):
    # x is the input waveform, sr is its sample rate
    input_batch = vggish_input.waveform_to_examples(x, sr)
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: input_batch})
    # Postprocess the results to produce whitened quantized embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return tf.cast(postprocessed_batch, dtype='float32')

def wav2vlad(wave_data, sr):
    global cluster_size
    signal = wave_data
    melspec = librosa.feature.melspectrogram(signal, n_mels=80, sr=sr).astype(np.float32).T
    melspec = np.log(np.maximum(1e-6, melspec))
    feature_size = melspec.shape[1]
    max_samples = melspec.shape[0]
    output_dim = cluster_size * 16
    feat = lpk.NetVLAD(feature_size=feature_size, max_samples=max_samples,
                       cluster_size=cluster_size, output_dim=output_dim) \
        (tf.convert_to_tensor(melspec))
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        r = feat.numpy()
    return r

def extract_features(number, audio_features, targets, path):
    global max_len, min_len
    if not os.path.exists(os.path.join(prefix, '{1}/{0}/positive_out.wav'.format(number, path))):
        return
    positive_file = wave.open(os.path.join(prefix, '{1}/{0}/positive_out.wav'.format(number, path)))
    sr1 = positive_file.getframerate()
    nframes1 = positive_file.getnframes()
    wave_data1 = np.frombuffer(positive_file.readframes(nframes1), dtype=np.short).astype(np.float)
    len1 = nframes1 / sr1

    neutral_file = wave.open(os.path.join(prefix, '{1}/{0}/neutral_out.wav'.format(number, path)))
    sr2 = neutral_file.getframerate()
    nframes2 = neutral_file.getnframes()
    wave_data2 = np.frombuffer(neutral_file.readframes(nframes2), dtype=np.short).astype(np.float)
    len2 = nframes2 / sr2

    negative_file = wave.open(os.path.join(prefix, '{1}/{0}/negative_out.wav'.format(number, path)))
    sr3 = negative_file.getframerate()
    nframes3 = negative_file.getnframes()
    wave_data3 = np.frombuffer(negative_file.readframes(nframes3), dtype=np.short).astype(np.float)
    len3 = nframes3/sr3

    for l in [len1, len2, len3]:
        if l > max_len:
            max_len = l
        if l < min_len:
            min_len = l

    with open(os.path.join(prefix, '{1}/{0}/new_label.txt'.format(number, path))) as fli:
        target = float(fli.readline())

    if wave_data1.shape[0] < 1:
        wave_data1 = np.array([1e-4]*sr1*5)
    if wave_data2.shape[0] < 1:
        wave_data2 = np.array([1e-4]*sr2*5)
    if wave_data3.shape[0] < 1:
        wave_data3 = np.array([1e-4]*sr3*5)
    audio_features.append([wav2vlad(wave_data1, sr1), wav2vlad(wave_data2, sr2),
                           wav2vlad(wave_data3, sr3)])
    # targets.append(1 if target >= 53 else 0)
    targets.append(target)

audio_features = []
audio_targets = []

for index in range(114):
    extract_features(index+1, audio_features, audio_targets, 'Data')
for index in range(114):
    extract_features(index+1, audio_features, audio_targets, 'ValidationData')

print("Saving npz file locally...")
np.savez(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_%d.npz' % (cluster_size*16)), audio_features)
np.savez(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_%d.npz' % (cluster_size*16)), audio_targets)
print(max_len, min_len)
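extract_features reads the three emotion-prompt recordings per subject with the wave module, converts the int16 frames to floats, and pads in near-silence when a recording is empty. A self-contained sketch of just that loading step (the path in the usage comment is a hypothetical placeholder):

import wave
import numpy as np

def load_wav_as_float(path):
    """Read a 16-bit PCM wav the same way extract_features does."""
    f = wave.open(path)
    sr = f.getframerate()
    nframes = f.getnframes()
    data = np.frombuffer(f.readframes(nframes), dtype=np.short).astype(float)
    f.close()
    if data.shape[0] < 1:                  # guard for empty recordings, as in the source
        data = np.array([1e-4] * sr * 5)   # five seconds of near-silence
    return data, sr

# usage (hypothetical path):
# wave_data, sr = load_wav_as_float('Data/1/positive_out.wav')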
================================================
FILE: DepressionCollected/Classification/audio_gru_whole.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools

prefix = os.path.abspath(os.path.join(os.getcwd(), "."))
audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2)
audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0']
audio_dep_idxs_tmp = np.where(audio_targets == 1)[0]
audio_non_idxs = np.where(audio_targets == 0)[0]

class AudioBiLSTM(nn.Module):
    def __init__(self, config):
        super(AudioBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()
        # self.init_weight()

    def init_weight(net):
        for name, param in net.named_parameters():
            if not 'ln' in name:
                if 'bias' in name:
                    nn.init.constant_(param, 0.0)
                elif 'weight' in name:
                    nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True))
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # self.lstm_net_audio = nn.LSTM(self.embedding_size,
        #     self.hidden_dims,
        #     num_layers=self.rnn_layers,
        #     dropout=self.dropout,
        #     bidirectional=self.bidirectional,
        #     batch_first=True)
        self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims,
                                     num_layers=self.rnn_layers, dropout=self.dropout,
                                     batch_first=True)
        self.ln = nn.LayerNorm(self.embedding_size)

        # FC layer
        self.fc_audio = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            # nn.ReLU(),
            nn.Softmax(dim=1)
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out: [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        # print(atten_w.shape, m.transpose(1, 2).shape)
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        x = self.ln(x)
        x, _ = self.lstm_net_audio(x)
        x = x.mean(dim=1)
        out = self.fc_audio(x)
        return out

config = {
    'num_classes': 2,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 256,
    'batch_size': 8,
    'epochs': 170,
    'learning_rate': 6e-6,
    'hidden_dims': 256,
    'bidirectional': False,
    'cuda': False
}

def save(model, filename):
    save_filename = '{}.pt'.format(filename)
    torch.save(model, save_filename)
    print('Saved as %s' % save_filename)

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : ndarray - 1D
    y_test_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test.cpu().numpy(), y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
    y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred.numpy())
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

def train(epoch, train_idxs):
    global lr, train_acc
    model.train()
    batch_idx = 1
    total_loss = 0
    correct = 0
    pred = np.array([])
    X_train = audio_features[train_idxs]
    Y_train = audio_targets[train_idxs]
    for i in range(0, X_train.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_train.shape[0]:
            x, y = X_train[i:], Y_train[i:]
        else:
            x, y = X_train[i:(i + config['batch_size'])], Y_train[i:(i + config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        else:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \
                Variable(torch.from_numpy(y))
        # zero the model's parameter gradients
        optimizer.zero_grad()
        output = model(x)
        pred = output.data.max(1, keepdim=True)[1]
        # print(pred.shape, y.shape)
        correct += pred.eq(y.data.view_as(pred)).cpu().sum()
        loss = criterion(output, y)
        # backpropagate
        loss.backward()
        # update the network parameters from the gradients
        optimizer.step()
        batch_idx += 1
        # loss.item() extracts the scalar loss value from the tensor
        total_loss += loss.item()
    train_acc = correct
    print(
        'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '
        .format(epoch + 1, config['learning_rate'], total_loss, correct,
                X_train.shape[0], 100. * correct / X_train.shape[0]))

def evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs):
    model.eval()
    batch_idx = 1
    total_loss = 0
    global max_f1, max_acc, min_mae, X_test_lens, max_prec, max_rec
    pred = np.array([])
    with torch.no_grad():
        if config['cuda']:
            x, y = Variable(torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor), requires_grad=True).cuda(), \
                Variable(torch.from_numpy(audio_targets[test_idxs])).cuda()
        else:
            x, y = Variable(torch.from_numpy(audio_features[test_idxs]).type(torch.FloatTensor), requires_grad=True), \
                Variable(torch.from_numpy(audio_targets[test_idxs])).type(torch.LongTensor)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        total_loss += loss.item()
        y_test_pred, conf_matrix = model_performance(y, output.cpu())
        accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
        precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
        recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
        f1_score = 2 * (precision * recall) / (precision + recall)
        print("Accuracy: {}".format(accuracy))
        print("Precision: {}".format(precision))
        print("Recall: {}".format(recall))
        print("F1-Score: {}\n".format(f1_score))
        print('=' * 89)
        if max_f1 <= f1_score and train_acc > len(train_idxs)*0.90 and f1_score > 0.5:
            max_f1 = f1_score
            max_acc = accuracy
            max_rec = recall
            max_prec = precision
            mode = 'gru'
            save(model, os.path.join(prefix, 'Model/ClassificationWhole/Audio/BiLSTM_{}_vlad{}_{}_{:.2f}_{}'.format(
                mode, config['embedding_size'], config['hidden_dims'], max_f1, fold)))
            np.save(os.path.join(prefix, 'Features/TextWhole/train_idxs_{:.2f}_{}.npy'.format(f1_score, fold)), train_idxs_tmp)
            print('*' * 64)
            print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc))
            print('*' * 64)
    return total_loss
def get_param_group(model):
    nd_list = []
    param_list = []
    for name, param in model.named_parameters():
        if 'ln' in name:
            nd_list.append(param)
        else:
            param_list.append(param)
    return [{'params': param_list, 'weight_decay': 1e-5},
            {'params': nd_list, 'weight_decay': 0}]

if __name__ == '__main__':
    # kf = KFold(n_splits=3, shuffle=True)
    # fold = 1
    # for train_idxs_tmp, test_idxs_tmp in kf.split(audio_features):
    train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True),
                       np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True),
                       np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)]
    for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps):
        fold = idx_idx + 1
        # if idx_idx != 1:
        #     continue
        test_idxs_tmp = list(set(list(audio_dep_idxs_tmp)+list(audio_non_idxs)) - set(train_idxs_tmp))
        train_idxs, test_idxs = [], []
        resample_idxs = [0,1,2,3,4,5]
        # depression data augmentation
        for idx in train_idxs_tmp:
            if idx in audio_dep_idxs_tmp:
                feat = audio_features[idx]
                count = 0
                for i in itertools.permutations(feat, feat.shape[0]):
                    if count in resample_idxs:
                        audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                        audio_targets = np.hstack((audio_targets, 1))
                        train_idxs.append(len(audio_features)-1)
                    count += 1
            else:
                train_idxs.append(idx)
        for idx in test_idxs_tmp:
            if idx in audio_dep_idxs_tmp:
                feat = audio_features[idx]
                count = 0
                # resample_idxs = random.sample(range(6), 4)
                resample_idxs = [0,1,4,5]
                for i in itertools.permutations(feat, feat.shape[0]):
                    if count in resample_idxs:
                        audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0)))
                        audio_targets = np.hstack((audio_targets, 1))
                        test_idxs.append(len(audio_features)-1)
                    count += 1
            else:
                test_idxs.append(idx)
            # test_idxs.append(idx)

        model = AudioBiLSTM(config)
        if config['cuda']:
            model = model.cuda()
        param_group = get_param_group(model)
        optimizer = optim.AdamW(param_group, lr=config['learning_rate'])
        criterion = nn.CrossEntropyLoss()
        # criterion = FocalLoss(class_num=2)
        max_f1 = -1
        max_acc = -1
        max_rec = -1
        max_prec = -1
        train_acc = -1
        for ep in range(1, config['epochs']):
            train(ep, train_idxs)
            tloss = evaluate(model, test_idxs, fold, train_idxs_tmp, train_idxs)
        fold += 1
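get_param_group above routes every parameter whose name contains 'ln' (the LayerNorm) into a zero-weight-decay group for AdamW, a common trick to keep normalization parameters unregularized. A minimal reproduction with a toy module that mirrors the naming:

import torch.nn as nn
import torch.optim as optim

# toy module with a LayerNorm registered as 'ln', mirroring AudioBiLSTM's naming
model = nn.Sequential()
model.add_module('ln', nn.LayerNorm(8))
model.add_module('fc', nn.Linear(8, 2))

decay, no_decay = [], []
for name, param in model.named_parameters():
    (no_decay if 'ln' in name else decay).append(param)

optimizer = optim.AdamW([{'params': decay, 'weight_decay': 1e-5},
                         {'params': no_decay, 'weight_decay': 0}], lr=6e-6)
print([len(g['params']) for g in optimizer.param_groups])   # [2, 2]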
================================================
FILE: DepressionCollected/Classification/fuse_net_whole.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import re
from allennlp.commands.elmo import ElmoEmbedder
import os
import tensorflow.compat.v1 as tf
import itertools

prefix = os.path.abspath(os.path.join(os.getcwd(), "./"))
text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0']
text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0']
audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_clf_256.npz'))['arr_0'], axis=2)
audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_clf_256.npz'))['arr_0']
fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])]
fuse_targets = text_targets
fuse_dep_idxs = np.where(text_targets == 1)[0]
fuse_non_idxs = np.where(text_targets == 0)[0]

def save(model, filename):
    save_filename = '{}.pt'.format(filename)
    torch.save(model, save_filename)
    print('Saved as %s' % save_filename)

def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_test : ndarray - 1D
    y_test_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
    # y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

class TextBiLSTM(nn.Module):
    def __init__(self, config):
        super(TextBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()
        self.init_weight()

    def init_weight(net):
        for name, param in net.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # two-layer LSTM
        self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=self.bidirectional)
        # self.init_weight()

        # FC layer
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            # nn.ReLU(),
            nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out: [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        # x : [len_seq, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(x)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        return self.fc_out(atten_out)

class AudioBiLSTM(nn.Module):
    def __init__(self, config):
        super(AudioBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()
        # self.init_weight()

    def init_weight(net):
        for name, param in net.named_parameters():
            if not 'ln' in name:
                if 'bias' in name:
                    nn.init.constant_(param, 0.0)
                elif 'weight' in name:
                    nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True))
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # self.lstm_net_audio = nn.LSTM(self.embedding_size,
        #     self.hidden_dims,
        #     num_layers=self.rnn_layers,
        #     dropout=self.dropout,
        #     bidirectional=self.bidirectional,
        #     batch_first=True)
        self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims,
                                     num_layers=self.rnn_layers, dropout=self.dropout,
                                     batch_first=True)
        self.ln = nn.LayerNorm(self.embedding_size)

        # FC layer
        self.fc_audio = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            # nn.ReLU(),
            nn.Softmax(dim=1)
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out: [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        # print(atten_w.shape, m.transpose(1, 2).shape)
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        x = self.ln(x)
        x, _ = self.lstm_net_audio(x)
        x = x.mean(dim=1)
        out = self.fc_audio(x)
        return out
nn.Dropout(self.dropout) ) # ============================= TextBiLSTM ================================= # ============================= AudioBiLSTM ============================= self.lstm_net_audio = nn.GRU(self.audio_embed_size, self.audio_hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=False, batch_first=True) self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims), nn.ReLU(), nn.Dropout(self.dropout) ) self.ln = nn.LayerNorm(self.audio_embed_size) # ============================= AudioBiLSTM ============================= # ============================= last fc layer ============================= # self.bn = nn.BatchNorm1d(self.text_hidden_dims + self.audio_hidden_dims) # modal attention self.modal_attn = nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.text_hidden_dims + self.audio_hidden_dims, bias=False) self.fc_final = nn.Sequential( nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.num_classes, bias=False), # nn.ReLU(), nn.Softmax(dim=1), # nn.Sigmoid() ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def pretrained_feature(self, x): with torch.no_grad(): x_text = [] x_audio = [] for ele in x: x_text.append(ele[1]) x_audio.append(ele[0]) x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False), Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False) # ============================= TextBiLSTM ================================= # x : [len_seq, batch_size, embedding_dim] x_text = x_text.permute(1, 0, 2) output, (final_hidden_state, _) = self.lstm_net(x_text) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) text_feature = self.fc_out(atten_out) # ============================= TextBiLSTM ================================= # ============================= AudioBiLSTM ============================= x_audio = self.ln(x_audio) x_audio, _ = self.lstm_net_audio(x_audio) x_audio = x_audio.sum(dim=1) audio_feature = self.fc_audio(x_audio) # ============================= AudioBiLSTM ============================= return (text_feature, audio_feature) def forward(self, x): # x = self.bn(x) # modal_weights = torch.softmax(self.modal_attn(x), dim=1) # modal_weights = self.modal_attn(x) # x = 
(modal_weights * x) output = self.fc_final(x) return output class MyLoss(nn.Module): def __init__(self): super(MyLoss, self).__init__() def forward(self, text_feature, audio_feature, target, model): weight = model.fc_final[0].weight # bias = model.fc_final[0].bias # print(weight, bias) pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']]) pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:]) l = nn.CrossEntropyLoss() target = torch.tensor(target) # l = nn.BCEWithLogitsLoss() # target = F.one_hot(target, num_classes=2).type(torch.FloatTensor) # print('y: {}\npred_audio: {}\npred_text: {}\n'.format(target, pred_audio.data.max(1, keepdim=True)[1], pred_text.data.max(1, keepdim=True)[1])) # return l(pred_text, target) + l(pred_audio, target) + \ # config['lambda']*torch.norm(weight[:, :config['text_hidden_dims']]) + \ # config['lambda']*torch.norm(weight[:, config['text_hidden_dims']:]) # a = F.softmax(pred_text, dim=1) + F.softmax(pred_audio, dim=1) return l(pred_text, target) + l(pred_audio, target) config = { 'num_classes': 2, 'dropout': 0.3, 'rnn_layers': 2, 'audio_embed_size': 256, 'text_embed_size': 1024, 'batch_size': 2, 'epochs': 100, 'learning_rate': 8e-6, 'audio_hidden_dims': 256, 'text_hidden_dims': 128, 'cuda': False, 'lambda': 1e-5, } model = fusion_net(config['text_embed_size'], config['text_hidden_dims'], config['rnn_layers'], \ config['dropout'], config['num_classes'], config['audio_hidden_dims'], config['audio_embed_size']) optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) # optimizer = optim.Adam(model.parameters()) # criterion = nn.CrossEntropyLoss() criterion = MyLoss() def train(epoch, train_idxs): global max_train_acc, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 X_train = [] Y_train = [] for idx in train_idxs: X_train.append(fuse_features[idx]) Y_train.append(fuse_targets[idx]) for i in range(0, len(X_train), config['batch_size']): if i + config['batch_size'] > len(X_train): x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() # 将模型的参数梯度设置为0 optimizer.zero_grad() text_feature, audio_feature = model.pretrained_feature(x) # text_feature = torch.from_numpy(ss.fit_transform(text_feature.numpy())) # audio_feature = torch.from_numpy(ss.fit_transform(audio_feature.numpy())) # concat_x = torch.cat((audio_feature, text_feature), dim=1) concat_x = torch.cat((text_feature, audio_feature), dim=1) # dot_x = text_feature.mul(audio_feature) # add_x = text_feature.add(audio_feature) output = model(concat_x) pred = output.data.max(1, keepdim=True)[1] correct += pred.eq(torch.tensor(y).data.view_as(pred)).cpu().sum() # loss = criterion(output, torch.tensor(y)) loss = criterion(text_feature, audio_feature, y, model) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 total_loss += loss.item() cur_loss = total_loss max_train_acc = correct train_acc = correct print('Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '.format( epoch, config['learning_rate'], cur_loss/len(X_train), correct, len(X_train), 100. 
* correct / len(X_train))) def evaluate(model, test_idxs, fold, train_idxs): model.eval() batch_idx = 1 total_loss = 0 pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor) X_test = [] Y_test = [] for idx in test_idxs: X_test.append(fuse_features[idx]) Y_test.append(fuse_targets[idx]) global max_train_acc, max_acc,max_f1 for i in range(0, len(X_test), config['batch_size']): if i + config['batch_size'] > len(X_test): x, y = X_test[i:], Y_test[i:] else: x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() text_feature, audio_feature = model.pretrained_feature(x) with torch.no_grad(): # concat_x = torch.cat((audio_feature, text_feature), dim=1) audio_feature_norm = (audio_feature - audio_feature.mean())/audio_feature.std() text_feature_norm = (text_feature - text_feature.mean())/text_feature.std() concat_x = torch.cat((text_feature, audio_feature), dim=1) output = model(concat_x) # loss = criterion(output, torch.tensor(y)) loss = criterion(text_feature, audio_feature, y, model) pred = torch.cat((pred, output.data.max(1, keepdim=True)[1])) total_loss += loss.item() y_test_pred, conf_matrix = model_performance(Y_test, pred[config['batch_size']:]) print('\nTest set: Average loss: {:.4f}'.format(total_loss/len(X_test))) # custom evaluation metrics print('Calculating additional test metrics...') accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('='*89) if max_f1 < f1_score and max_train_acc >= len(train_idxs)*0.9 and f1_score > 0.61: max_f1 = f1_score max_acc = accuracy save(model, os.path.join(prefix, 'Model/ClassificationWhole/Fuse/fuse_{:.2f}_{}'.format(max_f1, fold))) print('*'*64) print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) print('*'*64) return total_loss if __name__ == '__main__': idxs_paths = ['train_idxs_0.63_1.npy', 'train_idxs_0.65_2.npy', 'train_idxs_0.60_3.npy'] text_model_paths = ['BiLSTM_128_0.64_1.pt', 'BiLSTM_128_0.66_2.pt', 'BiLSTM_128_0.62_3.pt'] audio_model_paths = ['BiLSTM_gru_vlad256_256_0.67_1.pt', 'BiLSTM_gru_vlad256_256_0.67_2.pt', 'BiLSTM_gru_vlad256_256_0.63_3.pt'] for fold in range(1, 4): # if fold != 2: # continue train_idxs_tmp = np.load(os.path.join(prefix, 'Features/TextWhole/{}'.format(idxs_paths[fold-1])), allow_pickle=True) test_idxs_tmp = list(set(list(fuse_dep_idxs)+list(fuse_non_idxs)) - set(train_idxs_tmp)) resample_idxs = list(range(6)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in fuse_dep_idxs: feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 3) count = 0 for fuse_perm in zip(audio_perm, text_perm): if count in resample_idxs: fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, 1)) train_idxs.append(len(fuse_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in fuse_dep_idxs: feat = fuse_features[idx] audio_perm = itertools.permutations(feat[0], 3) text_perm = itertools.permutations(feat[1], 
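                # NOTE (added): both generators enumerate the 3! = 6 topic
                # orderings in the same index order, so the zip() below keeps
                # the audio and text modalities aligned per permutation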
3) count = 0 resample_idxs = [0,1,4,5] for fuse_perm in zip(audio_perm, text_perm): if count in resample_idxs: fuse_features.append(fuse_perm) fuse_targets = np.hstack((fuse_targets, 1)) test_idxs.append(len(fuse_features)-1) count += 1 else: test_idxs.append(idx) text_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Text/{}'.format(text_model_paths[fold-1]))) audio_lstm_model = torch.load(os.path.join(prefix, 'Model/ClassificationWhole/Audio/{}'.format(audio_model_paths[fold-1]))) model_state_dict = {} model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] model_state_dict['ln.weight'] = audio_lstm_model.state_dict()['ln.weight'] model_state_dict['ln.bias'] = audio_lstm_model.state_dict()['ln.bias'] model.load_state_dict(text_lstm_model.state_dict(), strict=False) # model.load_state_dict(audio_lstm_model.state_dict(), strict=False) model.load_state_dict(model_state_dict, strict=False) for param in model.parameters(): param.requires_grad = False model.fc_final[0].weight.requires_grad = True # model.fc_final[0].bias.requires_grad = True # model.modal_attn.weight.requires_grad = True max_f1 = -1 max_acc = -1 max_train_acc = -1 for ep in range(1, config['epochs']): train(ep, train_idxs) tloss = evaluate(model, test_idxs, fold, train_idxs) ================================================ FILE: DepressionCollected/Classification/text_bilstm_whole.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_clf_avg.npz'))['arr_0'] text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_clf_avg.npz'))['arr_0'] text_dep_idxs_tmp = np.where(text_targets == 1)[0] text_non_idxs = np.where(text_targets == 0)[0] class TextBiLSTM(nn.Module): def __init__(self, config): super(TextBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] 
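        # NOTE (added): the remaining hyperparameters come from the config
        # dict defined later in this file; embedding_size is 1024 to match
        # the averaged ELMo sentence vectors from text_features_whole.py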
self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() self.init_weight() def init_weight(net): for name, param in net.named_parameters(): if 'ln' not in name: if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True) ) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) # 双层lstm self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional) # FC层 # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes) self.fc_out = nn.Sequential( # nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), # nn.ReLU(), nn.Softmax(dim=1), ) self.ln1 = nn.LayerNorm(self.embedding_size) self.ln2 = nn.LayerNorm(self.hidden_dims) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): # x : [len_seq, batch_size, embedding_dim] x = x.permute(1, 0, 2) # x = self.ln1(x) output, (final_hidden_state, _) = self.lstm_net(x) # output : [batch_size, len_seq, n_hidden * 2] output = output.permute(1, 0, 2) # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden] final_hidden_state = final_hidden_state.permute(1, 0, 2) # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True) # atten_out = self.attention_net(output, final_hidden_state) atten_out = self.attention_net_with_w(output, final_hidden_state) # atten_out = self.ln2(atten_out) return self.fc_out(atten_out) def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def standard_confusion_matrix(y_test, y_test_pred): """ Make confusion matrix with format: ----------- | TP | FP | ----------- | FN | TN | ----------- Parameters ---------- y_true : ndarray - 1D y_pred : ndarray - 1D Returns ------- ndarray - 2D """ [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred) return np.array([[tp, fp], [fn, tn]]) def model_performance(y_test, y_test_pred_proba): """ Evaluation metrics for network performance. 
""" y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1] # Computing confusion matrix for test dataset conf_matrix = standard_confusion_matrix(y_test, y_test_pred) print("Confusion Matrix:") print(conf_matrix) return y_test_pred, conf_matrix def train(epoch, train_idxs): global lr, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 X_train = text_features[train_idxs] Y_train = text_targets[train_idxs] for i in range(0, X_train.shape[0], config['batch_size']): if i + config['batch_size'] > X_train.shape[0]: x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( i + config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(y)) # 将模型的参数梯度设置为0 optimizer.zero_grad() output = model(x) pred = output.data.max(1, keepdim=True)[1] #print(pred.shape, y.shape) correct += pred.eq(y.data.view_as(pred)).cpu().sum() loss = criterion(output, y) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 total_loss += loss.item() train_acc = correct print( 'Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n ' .format(epoch + 1, config['learning_rate'], total_loss, correct, X_train.shape[0], 100. * correct / X_train.shape[0])) def evaluate(model, test_idxs, fold, train_idxs): model.eval() batch_idx = 1 total_loss = 0 global max_f1, max_acc, min_mae, X_test_lens, max_prec, max_rec pred = np.array([]) with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(text_features[test_idxs]).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(text_targets[test_idxs])).cuda() else: x, y = Variable(torch.from_numpy(text_features[test_idxs]).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(text_targets[test_idxs])).type(torch.LongTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y) total_loss += loss.item() y_test_pred, conf_matrix = model_performance(y, output.cpu()) accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix) precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1]) recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0]) f1_score = 2 * (precision * recall) / (precision + recall) print("Accuracy: {}".format(accuracy)) print("Precision: {}".format(precision)) print("Recall: {}".format(recall)) print("F1-Score: {}\n".format(f1_score)) print('=' * 89) if max_f1 <= f1_score and train_acc > len(train_idxs)*0.9 and f1_score > 0.5: max_f1 = f1_score max_acc = accuracy max_rec = recall max_prec = precision save(model, os.path.join(prefix, 'Model/ClassificationWhole/Text/BiLSTM_{}_{:.2f}_{}'.format(config['hidden_dims'], max_f1, fold))) print('*' * 64) print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc)) print('*' * 64) return total_loss def get_param_group(model): nd_list = [] param_list = [] for name, param in model.named_parameters(): if 'ln' in name: nd_list.append(param) else: param_list.append(param) return [{'params': param_list, 'weight_decay': 1e-5}, {'params': nd_list, 'weight_decay': 0}] config = { 'num_classes': 2, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 1024, 'batch_size': 4, 'epochs': 150, 'learning_rate': 1e-5, 'hidden_dims': 128, 'bidirectional': True, 'cuda': False, } 
train_idxs_tmps = [np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.63_1.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_2.npy'), allow_pickle=True), np.load(os.path.join(prefix, 'Features/TextWhole/train_idxs_0.60_3.npy'), allow_pickle=True)] fold = 1 for idx_idx, train_idxs_tmp in enumerate(train_idxs_tmps): # if idx_idx != 2: # continue test_idxs_tmp = list(set(list(text_dep_idxs_tmp)+list(text_non_idxs)) - set(train_idxs_tmp)) train_idxs, test_idxs = [], [] # depression data augmentation for idx in train_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 resample_idxs = [0,1,2,3,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) train_idxs.append(len(text_features)-1) count += 1 else: train_idxs.append(idx) for idx in test_idxs_tmp: if idx in text_dep_idxs_tmp: feat = text_features[idx] count = 0 # resample_idxs = random.sample(range(6), 4) resample_idxs = [0,1,4,5] for i in itertools.permutations(feat, feat.shape[0]): if count in resample_idxs: text_features = np.vstack((text_features, np.expand_dims(list(i), 0))) text_targets = np.hstack((text_targets, 1)) test_idxs.append(len(text_features)-1) count += 1 else: test_idxs.append(idx) model = TextBiLSTM(config) param_group = get_param_group(model) optimizer = optim.AdamW(param_group, lr=config['learning_rate']) criterion = nn.CrossEntropyLoss() max_f1 = -1 max_acc = -1 max_rec = -1 max_prec = -1 train_acc = -1 for ep in range(1, config['epochs']): train(ep, train_idxs) tloss = evaluate(model, test_idxs, fold, train_idxs) fold += 1 ================================================ FILE: DepressionCollected/Classification/text_features_whole.py ================================================ import numpy as np import pandas as pd import wave import librosa import re # from allennlp.commands.elmo import ElmoEmbedder import os prefix = os.path.abspath(os.path.join(os.getcwd(), ".")) from elmoformanylangs import Embedder import pkuseg import thulac # from pyhanlp import HanLP import jieba # seg = pkuseg.pkuseg() # thu1 = thulac.thulac(seg_only=True) elmo = Embedder('/Users/linlin/Desktop/SpeechRecognition/DepressionCode/ELMoForManyLangs/zhs.model') topics = ['positive', 'neutral', 'negative'] answers = {} text_features = [] text_targets = [] def extract_features(text_features, text_targets, path): for index in range(114): if os.path.isdir(os.path.join(prefix, path, str(index+1))): answers[index+1] = [] for topic in topics: with open(os.path.join(prefix, path, str(index+1), '%s.txt'%(topic)) ,'r') as f: lines = f.readlines()[0] # seg_text = seg.cut(lines) # seg_text = thu1.cut(lines) # seg_text_iter = HanLP.segment(lines) seg_text_iter = jieba.cut(lines, cut_all=False) answers[index+1].append([item for item in seg_text_iter]) # answers[dir].append(seg_text) with open(os.path.join(prefix, '{1}/{0}/new_label.txt'.format(index+1, path))) as fli: target = float(fli.readline()) # text_targets.append(1 if target >= 53 else 0) text_targets.append(target) text_features.append([np.array(item).mean(axis=0) for item in elmo.sents2elmo(answers[index+1])]) extract_features(text_features, text_targets, 'Data') extract_features(text_features, text_targets, 'ValidationData') print("Saving npz file locally...") np.savez(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'), text_features) 
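# NOTE (added): the np.savez calls above and below persist the averaged
# 1024-d ELMo vectors (3 topic answers per subject) and the raw scores; the
# commented line above shows the >= 53 threshold used to derive the
# classification labels. Hypothetical reload sketch, mirroring how
# text_bilstm_whole.py reads these arrays back:
# feats = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0']
# labels = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0']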
np.savez(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'), text_targets) ================================================ FILE: DepressionCollected/DAICFeatureExtarction/feature_extraction.py ================================================ import os import sys sys.path.append('/Users/linlin/Desktop/DepressionCollected') from Classification.audio_features_whole import wav2vlad import numpy as np import pandas as pd import wave prefix = os.getcwd() train_split_df = pd.read_csv(os.path.join(prefix, 'DAIC/train_split_Depression_AVEC2017.csv')) test_split_df = pd.read_csv(os.path.join(prefix, 'DAIC/dev_split_Depression_AVEC2017.csv')) train_split_num = train_split_df[['Participant_ID']]['Participant_ID'].tolist() test_split_num = test_split_df[['Participant_ID']]['Participant_ID'].tolist() train_split_clabel = train_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist() test_split_clabel = test_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist() train_split_rlabel = train_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist() test_split_rlabel = test_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist() with open('./queries.txt') as f: queries = f.readlines() def identify_topics(sentence): for query in queries: query = query.strip('\n') sentence = sentence.strip('\n') if query == sentence: return True return False def extract_features(number): transcript = pd.read_csv(os.path.join(prefix, 'DAIC/{0}_P/{0}_TRANSCRIPT.csv'.format(number)), sep='\t').fillna('') wavefile = wave.open(os.path.join(prefix, 'DAIC/{0}_P/{0}_AUDIO.wav'.format(number, 'r'))) sr = wavefile.getframerate() nframes = wavefile.getnframes() wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short) response = '' start_time = 0 stop_time = 0 feats = [] signal = [] for t in transcript.itertuples(): # 问题开始 if getattr(t,'speaker') == 'Ellie' and (identify_topics(getattr(t,'value')) or 'i think i have asked everything' in getattr(t,'value')): # 初始化 response = '' if len(signal) == 0: continue feats.append(wav2vlad(signal, sr)) signal = [] elif getattr(t,'speaker') == 'Participant': if 'scrubbed_entry' in getattr(t,'value'): continue start_time = int(getattr(t,'start_time')*sr) stop_time = int(getattr(t,'stop_time')*sr) response += (' ' + getattr(t,'value')) signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float))) print(np.shape(feats)) print('{}_P feature done'.format(number)) return feats # training set audio_features_train = [] audio_ctargets_train = [] audio_rtargets_train = [] # test set audio_features_test = [] audio_ctargets_test = [] audio_rtargets_test = [] # training set for index in range(len(train_split_num)): feat = extract_features(train_split_num[index]) audio_features_train.append(feat) audio_ctargets_train.append(train_split_clabel[index]) audio_rtargets_train.append(train_split_rlabel[index]) print("Saving npz file locally...") np.savez(os.path.join(prefix, 'DAICCode/Features/train_samples_clf.npz'), audio_features_train) np.savez(os.path.join(prefix, 'DAICCode/Features/train_samples_reg.npz'), audio_features_train) np.savez(os.path.join(prefix, 'DAICCode/Features/train_labels_clf.npz'), audio_ctargets_train) np.savez(os.path.join(prefix, 'DAICCode/Features/train_labels_reg.npz'), audio_rtargets_train) # test set for index in range(len(test_split_num)): feat = extract_features(test_split_num[index]) audio_features_test.append(feat) audio_ctargets_test.append(test_split_clabel[index]) audio_rtargets_test.append(test_split_rlabel[index]) print("Saving npz file locally...") 
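# NOTE (added): extract_features() above slices the participant audio with
# .astype(np.float); that alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, so use float or np.float64 on recent NumPy versions.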
np.savez(os.path.join(prefix, 'DAICCode/Features/test_samples_clf.npz'), audio_features_test) np.savez(os.path.join(prefix, 'DAICCode/Features/test_samples_reg.npz'), audio_features_test) np.savez(os.path.join(prefix, 'DAICCode/Features/test_labels_clf.npz'), audio_ctargets_test) np.savez(os.path.join(prefix, 'DAICCode/Features/test_labels_reg.npz'), audio_rtargets_test) ================================================ FILE: DepressionCollected/DAICFeatureExtarction/queries.txt ================================================ how are you doing today where are you from originally why'd you move to l_a how do you like l_a what are some things you really like about l_a how easy was it for you to get used to living in l_a what are some things you don't really like about l_a what'd you study at school are you still doing that what's your dream job do you travel a lot why how often do you go back to your hometown do you consider yourself an introvert what do you do to relax how are you at controlling your temper when was the last time you argued with someone and what was it about how did you feel in that moment tell me more about that how close are you to them how do you know them what are some things you like to do for fun who's someone that's been a positive influence in your life can you tell me about that how close are you to your family is there anything you regret what made you decide to do that could you have done anything to avoid it what's one of your most memorable experiences what's it like for you living with them how do you like your living situation do you have roommates how easy is it for you to get a good night's sleep do you feel that way often what are you like when you don't sleep well do you feel down have you been diagnosed with depression have you ever been diagnosed with p_t_s_d have you ever served in the military when was the last time you felt really happy what do you think of today's kids can you give me an example of that what do you do when you're annoyed when was the last time that happened how would your best friend describe you where do you live how hard is that what do you do now are you happy you did that what are some things that make you really mad what do you do to relax like what are you still working in that can you give me an example of that do you feel down like what how do you cope with them have you noticed any changes in your behavior or thoughts lately do you have disturbing thoughts how easy is it for you to get a good night sleep what do you enjoy about traveling i'd love to hear about one of your trips what advice would you give yourself ten or twenty years ago what are some things you really like about l_a how are you at controlling your temper has that gotten you in trouble do you find it easy to be a parent what's the hardest thing about being a parent tell me about your kids what's one of your most memorable experiences how did you feel in that moment have you ever served in the military have you been diagnosed with depression how would you best friend describe you what'd you study at school nice are you still doing that what are some things that make you really mad could you have done anything to avoid it could you say a little more about that when was the last time you argued with someone and what was it about do you travel a lot when was the last time that happened have you ever been diagnosed with p_t_s_d how would your best friend describe you when was the last time you felt really happy how did you decide to do that okay could you have 
done anything to avoid it do you feel like therapy is useful did you think you had a problem before you found out how has seeing a therapist affected you what sort of changes have you noticed since you've been going to therapy why did you stop who's someone that's been a positive influence in your life when did you move to l_a how often do you go back to your home town what got you to seek help what were your symptoms yeah what do you enjoy about traveling okay what's the best thing about being a parent when was the last time you argued with someone and what was it about could you say a little more about that how long ago were you diagnosed so how are you doing today could you say a little more about that do you still go to therapy now do you feel like therapy's useful have you noticed any changes in your behavior or thoughts lately tell me about that what would you say are some of your best qualities what are some things that usually put you in a good mood what are you most proud of in your life how does it compare to l_a tell me about something you did recently that you really enjoyed is going to a therapist helping you how have you been feeling lately are they triggered by something what's the best thing about being a parent why'd you decide to enlist in the military how old were you when you joined the military how did serving in the military change you what did you do after the military when'd you move to l_a how has seeing a therapist affected you who's someone that's been a positive influence in your life what are some things you like to do for fun who's someone that's been a positive influence in your life what was it about do you think that maybe you're being a little hard on yourself so how are you doing today where are you from originally how easy was it for you to get used to living in l_a what are some things you don't really like about l_a how often to you go back to your home town why how close are you to your family do you travel a lot what do you enjoy about traveling i'd love to hear about one of your trips do you consider yourself an introvert can you give me an example of that what do you do when you're annoyed what do you do to relax what's your dream job how long ago were you diagnosed what got you to seek help do you feel like therapy's useful do you still go to therapy now what sort of changes have you noticed since you've been going to therapy how have you been feeling lately tell me more about that what would you say are some of your best qualities what are some things that usually put you in a good mood when was the last time you felt really happy who's someone that's been a positive influence in your life how do you know them how close are you to them what are you most proud of in your life are you still doing that do you consider yourself an introvert do you feel that way often how do you like your living situation do you have roommates how easy is it for you to get a good night's sleep what are you like when you don't sleep well what advice would you give yourself ten or twenty years ago how close are you to your family tell me about something you did recently that you really enjoyed what are some things that usually put you in a good mood why why what made you decide to go and see someone okay so how are you doing today why'd you move to l_a how often do you go back to your hometown how did you decide to do that is there anything you regret could you have done anything to avoid it how easy is it for you to get a good night's sleep do you find it easy to be a 
parent what's the best thing about being a parent what's the hardest thing about being a parent and please feel free to tell me anything you answers are totally confidential and please feel free to tell me anything you're answers are totally confidential what made you decide to do that what advice would you give yourself ten or twenty years ago what do you think of today's kids tell me about that how hard is that can you tell me about that so how are you doing today are you still working in that what are some things you like to do for fun that's good where are you from originally when was the last time you argued with someone and what was it about where do you live did you think you had a problem before you found out what were your symptoms why did you stop okay so how are you doing today what do you do now are you happy you did that are they triggered by something how do you cope with them has that gotten you in trouble what are you what are some things that make you really mad how has seeing a therapist affected you yeah how hard is that mhm what are some things you don't really like about l_a mhm how did you decide to do that how close are you to your family do you find it easy to be a parent that's good what do you think of today's kids awesome how did you decide to do that uh huh uh huh uh huh is there anything you regret is there anything you regret how old were you when you joined the military did you ever see combat how did serving in the military change you what did you do after the military how easy was it for you to go back to civilian life is going to a therapist helping you that's good where are you from originally tell me about your kids yeah how hard is that do you think that maybe you're being a little hard on yourself do you consider yourself and introvert how often do you go back to your home town how_doingV (so how are you doing today) where_originally (where are you from originally) like_about_LA (what are some things you really like about l_a) dont_like_LA (what are some things you don't really like about l_a) study (what did you study at school) still_doing_X (are you still doing that) change_directions (what made you decide to do that) happy_didthat (are you happy you did that) job_virtually (i love my job you could almost say it's virtually made for me what's your dream job) shyoutgoing (do you consider yourself more shy or outgoing) tell_about_that (can you tell me about that) relax_fishtank (sometimes when i'm feeling tense i turn on the fish tank screensaver hey i know it's not hawaii but it's the best i've got what do you do to relax) control_temper (how are you at controlling your temper) last_argument (when was the last time you argued with someone and what was it about) hard_decisionB (tell me about the hardest decision you've ever had to make) family_relationship (tell me about your relationship with your family) feelguilty (what's something you feel guilty about) give_example (can you give me an example of that) describe_felt (how did you feel in that moment) ptsd_diagnosed (have you ever been diagnosed with p_t_s_d) depression_diagnosed (have you been diagnosed with depression) easy_sleep (how easy is it for you to get a good night's sleep) feel_down (do you feel down) behavior_changes (have you noticed any changes in your behavior or thoughts lately) happy_lasttime (tell me about the last time you felt really happy) self_change (what are some things you wish you could change about yourself) symptoms_cope (how do you cope with them) regret (is there 
anything you regret) advice_back (what advice would you give to yourself ten or twenty years ago) Ellie17Dec2012_08 (what are you most proud of in your life) difficult (how hard is that) BF_describe (how would your best friend describe you) ideal_weekendC (tell me how you spend your ideal weekend) asked_everything (okay i think i have asked everything i need to) travel_shoes (i'm sure you can tell by my shoes i'm not much of a world explorer do you travel a lot) like_what (like what) travel_trips (i'd love to hear about one of your trips) still_working_on_X (are you still working in that) dream_job (what's your dream job) situation_handled (tell me about a situation that you wish you had handled differently) why_enlist (why'd you decide to enlist in the military) old (how old were you when you joined the military) combat (did you ever see combat) why2 (why) effectB (how did serving in the military change you) after (what did you do after the military) civilian_life (how easy was it for you to go back to civilian life) feel_lately (how have you been feeling lately) therapy_useful (do you feel like therapy is useful) why_seek_help (what got you to seek help) therapy_going (do you still go to therapy now) therapist_affect (how has seeing a therapist affected you) landed_trouble (has that gotten you in trouble) when_LA (when did you move to l_a) often_backB (how often do you go back to your hometown) compares_LA (how does it compare to l_a) why_LA (why did you move to l_a) adapted_LA (how easy was it for you to get used to living in l_a) hard_decision (how did you decide to do that) easy_parent (do you find it easy to be a parent) parent_hardest (what's the hardest thing about being a parent) parent_best (what's the best thing about being a parent) parent_differences (what are some ways that you're different as a parent than your parents) military (have you ever served in the military) too_hard (do you think that maybe you're being a little hard on yourself) Ellie17Dec2012_07 (what would you say are some of your best qualities) memorableB (what's one of your most memorable experiences) travel_changed (what do you enjoy about traveling) memory_erase (tell me about an event or something that you wish you could erase from your memory) bouts_symptoms (when was the last time that happened) argument_about (what was it about) avoid (could you have done anything to avoid it) trigger (are they triggered by something) sleep_affects (what are you like when you don't sleep well) when_diagnosed (how long ago were you diagnosed) therapy_changes (what sort of changes have you noticed since you've been going to therapy) feelbadly (tell me about a time when someone made you feel really badly about yourself) more (tell me more about that) disturbing_thoughts (do you have disturbing thoughts) Ellie17Dec2012_10 (tell me about something you did recently that you really enjoyed) Ellie17Dec2012_09 (what are some things that usually put you in a good mood) do_fun (what are some things you like to do for fun) influence_positive (who's someone that's been a positive influence in your life) how_close (how close are you to them) tell_me_about (tell me about that) suspect_problem (did you think you had a problem before you found out) symptoms_what (what were your symptoms) how_know (how do you know them) therapist_useful (is going to a therapist helping you) stop_going (why did you stop) mad_makeyou (what are some things that make you really mad) where_live (where do you live) roommates (do you have roommates) 
living_situation (how do you like your living situation) what_do_when_annoyed (what do you do when you are annoyed) elaborate (could you say a little more about that) family_roleB (how close are you to your family) todays_kids (what do you think of today's kids) tell_me_moreV2 (can you tell me more about that) kids_elaborate (tell me about your kids) ================================================ FILE: DepressionCollected/Regression/AudioModelChecking.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] audio_dep_idxs = np.where(audio_targets >= 53)[0] audio_non_idxs = np.where(audio_targets < 53)[0] dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) config = { 'num_classes': 1, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 256, 'batch_size': 4, 'epochs': 100, 'learning_rate': 5e-5, 'hidden_dims': 256, 'bidirectional': False, 'cuda': False } class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional, batch_first=True) # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.bn = nn.BatchNorm1d(3) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = 
lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x, _ = self.lstm_net_audio(x) # x = self.bn(x) x = x.sum(dim=1) out = self.fc_audio(x) return out def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def evaluate(fold, model): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = audio_features[list(test_dep_idxs)+list(test_non_idxs)] Y_test = audio_targets[list(test_dep_idxs)+list(test_non_idxs)] with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) fold = 2 audio_lstm_model = torch.load(os.path.join(prefix, 'Model/Regression/Audio%d/gru_vlad256_256_8.25.pt'%(fold+1))) model = AudioBiLSTM(config) # model_state_dict = {} # model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0'] # model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0'] # model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0'] # model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0'] # model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1'] # model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1'] # model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1'] # model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1'] # model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight'] # model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias'] # model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight'] # model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias'] model_state_dict = audio_lstm_model.state_dict() model.load_state_dict(model_state_dict, strict=True) test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] test_non_idxs = non_idxs[fold*44:(fold+1)*44] train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) # training data augmentation train_dep_idxs = [] for (i, idx) in enumerate(train_dep_idxs_tmp): feat = audio_features[idx] 
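    # NOTE (added): feat holds 3 topic-level VLAD vectors, so the
    # permutations below yield 3! = 6 orderings per subject; only the first
    # 14 depressed training subjects are expanded, and the inner loop
    # variable shadows the enumerate index i (safe here, but easy to misread).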
if i < 14: for i in itertools.permutations(feat, feat.shape[0]): audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, audio_targets[idx])) train_dep_idxs.append(len(audio_features)-1) else: train_dep_idxs.append(idx) # test data augmentation # test_dep_idxs = [] # for idx in test_dep_idxs_tmp: # feat = audio_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) # audio_targets = np.hstack((audio_targets, audio_targets[idx])) # test_dep_idxs.append(len(audio_features)-1) test_dep_idxs = test_dep_idxs_tmp optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) criterion = nn.SmoothL1Loss() # criterion = FocalLoss(class_num=2) # evaluate(fold, model) evaluate(fold, model) ================================================ FILE: DepressionCollected/Regression/audio_bilstm_perm.py ================================================ import torch import torch.nn as nn from torch.autograd import Variable from torch.nn import functional as F import torch.optim as optim from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_absolute_error, mean_squared_error from sklearn.model_selection import train_test_split import numpy as np import pandas as pd import os import pickle import random import itertools prefix = os.path.abspath(os.path.join(os.getcwd(), "./")) audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2) audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0'] # audio_dep_idxs = np.where(audio_targets >= 53)[0] # audio_non_idxs = np.where(audio_targets < 53)[0] # dep_orders = random.sample(range(len(audio_dep_idxs)), len(audio_dep_idxs)) # non_orders = random.sample(range(len(audio_non_idxs)), len(audio_non_idxs)) # dep_idxs = audio_dep_idxs[dep_orders] # non_idxs = audio_non_idxs[non_orders] # np.save(os.path.join(prefix, 'Features/AudioWhole/dep_idxs'), dep_idxs) # np.save(os.path.join(prefix, 'Features/AudioWhole/non_idxs'), non_idxs) dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True) non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True) config = { 'num_classes': 1, 'dropout': 0.5, 'rnn_layers': 2, 'embedding_size': 256, 'batch_size': 2, 'epochs': 120, 'learning_rate': 1e-5, 'hidden_dims': 256, 'bidirectional': False, 'cuda': False } class AudioBiLSTM(nn.Module): def __init__(self, config): super(AudioBiLSTM, self).__init__() self.num_classes = config['num_classes'] self.learning_rate = config['learning_rate'] self.dropout = config['dropout'] self.hidden_dims = config['hidden_dims'] self.rnn_layers = config['rnn_layers'] self.embedding_size = config['embedding_size'] self.bidirectional = config['bidirectional'] self.build_model() def init_weight(net): for name, param in net.named_parameters(): if 'bias' in name: nn.init.constant_(param, 0.0) elif 'weight' in name: nn.init.xavier_uniform_(param) def build_model(self): # attention layer self.attention_layer = nn.Sequential( nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(inplace=True)) # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1) self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims, num_layers=self.rnn_layers, dropout=self.dropout, bidirectional=self.bidirectional, batch_first=True) # self.lstm_net_audio = 
nn.GRU(self.embedding_size, self.hidden_dims, # num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True) self.bn = nn.BatchNorm1d(3) # FC层 self.fc_audio = nn.Sequential( nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.hidden_dims), nn.ReLU(), nn.Dropout(self.dropout), nn.Linear(self.hidden_dims, self.num_classes), nn.ReLU(), # nn.Softmax(dim=1) ) def attention_net_with_w(self, lstm_out, lstm_hidden): ''' :param lstm_out: [batch_size, len_seq, n_hidden * 2] :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden] :return: [batch_size, n_hidden] ''' lstm_tmp_out = torch.chunk(lstm_out, 2, -1) # h [batch_size, time_step, hidden_dims] h = lstm_tmp_out[0] + lstm_tmp_out[1] # h = lstm_out # [batch_size, num_layers * num_directions, n_hidden] lstm_hidden = torch.sum(lstm_hidden, dim=1) # [batch_size, 1, n_hidden] lstm_hidden = lstm_hidden.unsqueeze(1) # atten_w [batch_size, 1, hidden_dims] atten_w = self.attention_layer(lstm_hidden) # m [batch_size, time_step, hidden_dims] m = nn.Tanh()(h) # atten_context [batch_size, 1, time_step] # print(atten_w.shape, m.transpose(1, 2).shape) atten_context = torch.bmm(atten_w, m.transpose(1, 2)) # softmax_w [batch_size, 1, time_step] softmax_w = F.softmax(atten_context, dim=-1) # context [batch_size, 1, hidden_dims] context = torch.bmm(softmax_w, h) result = context.squeeze(1) return result def forward(self, x): x, _ = self.lstm_net_audio(x) # x = self.bn(x) x = x.sum(dim=1) out = self.fc_audio(x) return out def save(model, filename): save_filename = '{}.pt'.format(filename) torch.save(model, save_filename) print('Saved as %s' % save_filename) def train(epoch): global lr, train_acc model.train() batch_idx = 1 total_loss = 0 correct = 0 pred = np.array([]) X_train = audio_features[train_dep_idxs+train_non_idxs] Y_train = audio_targets[train_dep_idxs+train_non_idxs] for i in range(0, X_train.shape[0], config['batch_size']): if i + config['batch_size'] > X_train.shape[0]: x, y = X_train[i:], Y_train[i:] else: x, y = X_train[i:(i + config['batch_size'])], Y_train[i:( i + config['batch_size'])] if config['cuda']: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda() else: x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(y)).type(torch.FloatTensor) # 将模型的参数梯度设置为0 optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) # 后向传播调整参数 loss.backward() # 根据梯度更新网络参数 optimizer.step() batch_idx += 1 # loss.item()能够得到张量中的元素值 pred = np.hstack((pred, output.flatten().detach().numpy())) total_loss += loss.item() train_mae = mean_absolute_error(Y_train, pred) print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n ' .format(epoch + 1, config['learning_rate'], total_loss, train_mae, \ np.sqrt(mean_squared_error(Y_train, pred)))) return train_mae def evaluate(fold, model, train_mae): model.eval() batch_idx = 1 total_loss = 0 global min_mae, min_rmse, test_dep_idxs, test_non_idxs pred = np.array([]) X_test = audio_features[list(test_dep_idxs)+list(test_non_idxs)] Y_test = audio_targets[list(test_dep_idxs)+list(test_non_idxs)] with torch.no_grad(): if config['cuda']: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(),\ Variable(torch.from_numpy(Y_test)).cuda() else: x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \ Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor) 
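            # NOTE (added): the optimizer.zero_grad() below is redundant in
            # evaluation -- we are inside torch.no_grad() and backward() is
            # never called here.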
optimizer.zero_grad() output = model(x) loss = criterion(output, y.view_as(output)) total_loss += loss.item() pred = output.flatten().detach().numpy() mae = mean_absolute_error(Y_test, pred) rmse = np.sqrt(mean_squared_error(Y_test, pred)) print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) print('='*89) if mae <= min_mae and mae < 8.5 and train_mae < 13: min_mae = mae min_rmse = rmse mode = 'bi' if config['bidirectional'] else 'norm' mode ='gru' save(model, os.path.join(prefix, 'Model/Regression/Audio{}/{}_vlad{}_{}_{:.2f}'.format(fold+1,mode, config['embedding_size'], config['hidden_dims'], min_mae))) print('*' * 64) print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse)) print('*' * 64) return total_loss for fold in range(3): test_dep_idxs_tmp = dep_idxs[fold*10:(fold+1)*10] test_non_idxs = non_idxs[fold*44:(fold+1)*44] train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp)) train_non_idxs = list(set(non_idxs) - set(test_non_idxs)) # training data augmentation train_dep_idxs = [] for (i, idx) in enumerate(train_dep_idxs_tmp): feat = audio_features[idx] if i < 14: for i in itertools.permutations(feat, feat.shape[0]): audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) audio_targets = np.hstack((audio_targets, audio_targets[idx])) train_dep_idxs.append(len(audio_features)-1) else: train_dep_idxs.append(idx) # test data augmentation # test_dep_idxs = [] # for idx in test_dep_idxs_tmp: # feat = audio_features[idx] # for i in itertools.permutations(feat, feat.shape[0]): # audio_features = np.vstack((audio_features, np.expand_dims(list(i), 0))) # audio_targets = np.hstack((audio_targets, audio_targets[idx])) # test_dep_idxs.append(len(audio_features)-1) test_dep_idxs = test_dep_idxs_tmp model = AudioBiLSTM(config) if config['cuda']: model = model.cuda() # optimizer = optim.Adam(model.parameters()) optimizer = optim.Adam(model.parameters(), lr=config['learning_rate']) criterion = nn.L1Loss() # criterion = FocalLoss(class_num=2) min_mae = 100 min_rmse = 100 train_mae = 100 for ep in range(1, config['epochs']): train_mae = train(ep) tloss = evaluate(fold, model, train_mae) # ============== prep ============== # X_test = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/val_samples_reg_avid256.npz'))['arr_0'], axis=2) # Y_test = np.load(os.path.join(prefix, 'Features/Audio/val_labels_reg_avid256.npz'))['arr_0'] # ============== prep ============== # ============== SVM ============== # from sklearn.svm import SVR # from sklearn.model_selection import KFold # X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs] # kf = KFold(n_splits=3) # regr = SVR(kernel='linear', gamma='auto') # maes, rmses = [], [] # for train_index, test_index in kf.split(X): # # X_train, X_test = X[train_index], X[test_index] # # Y_train, Y_test = Y[train_index], Y[test_index] # X_train, Y_train = X[train_index], Y[train_index] # regr.fit([f.flatten() for f in X_train], Y_train) # pred = regr.predict([f.flatten() for f in X_test]) # mae = mean_absolute_error(Y_test, pred) # rmse = np.sqrt(mean_squared_error(Y_test, pred)) # maes.append(mae) # rmses.append(rmse) # print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse)) # print('='*89) # # break # print(np.mean(maes), np.mean(rmses)) # ============== SVM ============== # # ============== DT ============== # from sklearn.tree import DecisionTreeRegressor # from sklearn.model_selection import KFold # X = 
# ============== DT ==============
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import KFold
# X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = DecisionTreeRegressor(max_depth=100, random_state=0, criterion="mse")
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
# print(np.mean(maes), np.mean(rmses))
# ============== DT ==============

# ============== RF ==============
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import KFold
# X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = RandomForestRegressor(max_depth=100, random_state=0, criterion="mse")
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
# print(np.mean(maes), np.mean(rmses))
# ============== RF ==============

# ============== ada ==============
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.model_selection import KFold
# X = audio_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = audio_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = AdaBoostRegressor(n_estimators=50)
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
# print(np.mean(maes), np.mean(rmses))
# ============== ada ==============
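# --- Illustrative sketch (not part of the original script) -------------------
# The training-set augmentation in the fold loop above relies on each sample
# holding one feature vector per interview topic (assumed shape (3, 256) after
# the squeeze on load), so permuting the three topic rows yields 3! = 6
# reorderings that all inherit the subject's label:
#
#     import itertools
#     import numpy as np
#
#     feat = np.random.rand(3, 256)  # one subject: positive/negative/neutral rows
#     perms = [np.stack(p) for p in itertools.permutations(feat, feat.shape[0])]
#     assert len(perms) == 6 and all(p.shape == (3, 256) for p in perms)
# ------------------------------------------------------------------------------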
================================================
FILE: DepressionCollected/Regression/fuse_net.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import re
from allennlp.commands.elmo import ElmoEmbedder
import os
import tensorflow.compat.v1 as tf
import itertools

prefix = os.path.abspath(os.path.join(os.getcwd(), "./"))
text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0']
text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0']
audio_features = np.squeeze(np.load(os.path.join(prefix, 'Features/AudioWhole/whole_samples_reg_256.npz'))['arr_0'], axis=2)
audio_targets = np.load(os.path.join(prefix, 'Features/AudioWhole/whole_labels_reg_256.npz'))['arr_0']
fuse_features = [[audio_features[i], text_features[i]] for i in range(text_features.shape[0])]
fuse_targets = text_targets
fuse_dep_idxs = np.where(text_targets >= 53)[0]
fuse_non_idxs = np.where(text_targets < 53)[0]
dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True)
non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True)
text_model_paths = ['Model/Regression/Text1/BiLSTM_128_7.75.pt',
                    'Model/Regression/Text2/BiLSTM_128_8.46.pt',
                    'Model/Regression/Text3/BiLSTM_128_8.01.pt']
audio_model_paths = ['Model/Regression/Audio1/gru_vlad256_256_7.60.pt',
                     'Model/Regression/Audio2/gru_vlad256_256_8.38.pt',
                     'Model/Regression/Audio3/gru_vlad256_256_8.25.pt']

config = {
    'num_classes': 1,
    'dropout': 0.5,
    'rnn_layers': 2,
    'audio_embed_size': 256,
    'text_embed_size': 1024,
    'batch_size': 4,
    'epochs': 150,
    'learning_rate': 8e-5,
    'audio_hidden_dims': 256,
    'text_hidden_dims': 128,
    'cuda': False,
    'lambda': 1e-2,
}


# TextBiLSTM and AudioBiLSTM are redefined here so that torch.load can
# unpickle the pretrained single-modality checkpoints.
class TextBiLSTM(nn.Module):
    def __init__(self, config):
        super(TextBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()
        self.init_weight()

    def init_weight(self):
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # two-layer LSTM
        self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=self.bidirectional)
        # self.init_weight()

        # fully connected layers
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            nn.ReLU(),
            # nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return:            [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        # x : [len_seq, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(x)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        return self.fc_out(atten_out)


class AudioBiLSTM(nn.Module):
    def __init__(self, config):
        super(AudioBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()

    def init_weight(self):
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True))
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims,
                                     num_layers=self.rnn_layers, dropout=self.dropout,
                                     bidirectional=self.bidirectional, batch_first=True)
        # self.lstm_net_audio = nn.GRU(self.embedding_size, self.hidden_dims,
        #                              num_layers=self.rnn_layers, dropout=self.dropout, batch_first=True)

        self.bn = nn.BatchNorm1d(3)

        # fully connected layers
        self.fc_audio = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            nn.ReLU(),
            # nn.Softmax(dim=1)
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return:            [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        # print(atten_w.shape, m.transpose(1, 2).shape)
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        x, _ = self.lstm_net_audio(x)
        # x = self.bn(x)
        x = x.sum(dim=1)
        out = self.fc_audio(x)
        return out


class fusion_net(nn.Module):
    def __init__(self, text_embed_size, text_hidden_dims, rnn_layers, dropout, num_classes,
                 audio_hidden_dims, audio_embed_size):
        super(fusion_net, self).__init__()
        self.text_embed_size = text_embed_size
        self.audio_embed_size = audio_embed_size
        self.text_hidden_dims = text_hidden_dims
        self.audio_hidden_dims = audio_hidden_dims
        self.rnn_layers = rnn_layers
        self.dropout = dropout
        self.num_classes = num_classes

        # ============================= TextBiLSTM =================================
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.text_hidden_dims, self.text_hidden_dims),
            nn.ReLU(inplace=True)
        )
        # two-layer LSTM
        self.lstm_net = nn.LSTM(self.text_embed_size, self.text_hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=True)
        # fully connected layers
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.text_hidden_dims, self.text_hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout)
        )
        # ============================= TextBiLSTM =================================

        # ============================= AudioBiLSTM =============================
        self.lstm_net_audio = nn.GRU(self.audio_embed_size, self.audio_hidden_dims,
                                     num_layers=self.rnn_layers, dropout=self.dropout,
                                     bidirectional=False, batch_first=True)
        self.fc_audio = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.audio_hidden_dims, self.audio_hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout)
        )
        # ============================= AudioBiLSTM =============================

        # ============================= last fc layer =============================
        # self.bn = nn.BatchNorm1d(self.text_hidden_dims + self.audio_hidden_dims)
        # modal attention
        self.modal_attn = nn.Linear(self.text_hidden_dims + self.audio_hidden_dims,
                                    self.text_hidden_dims + self.audio_hidden_dims, bias=False)
        self.fc_final = nn.Sequential(
            nn.Linear(self.text_hidden_dims + self.audio_hidden_dims, self.num_classes, bias=False),
            nn.ReLU(),
            # nn.Softmax(dim=1),
            # nn.Sigmoid()
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return:            [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def pretrained_feature(self, x):
        with torch.no_grad():
            x_text = []
            x_audio = []
            for ele in x:
                x_text.append(ele[1])
                x_audio.append(ele[0])
            x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False), \
                Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False)

            # ============================= TextBiLSTM =================================
            # x : [len_seq, batch_size, embedding_dim]
            x_text = x_text.permute(1, 0, 2)
            output, (final_hidden_state, _) = self.lstm_net(x_text)
            # output : [batch_size, len_seq, n_hidden * 2]
            output = output.permute(1, 0, 2)
            # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
            final_hidden_state = final_hidden_state.permute(1, 0, 2)
            # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
            # atten_out = self.attention_net(output, final_hidden_state)
            atten_out = self.attention_net_with_w(output, final_hidden_state)
            text_feature = self.fc_out(atten_out)
            # ============================= TextBiLSTM =================================
            # ============================= AudioBiLSTM =============================
            x_audio, _ = self.lstm_net_audio(x_audio)
            x_audio = x_audio.sum(dim=1)
            audio_feature = self.fc_audio(x_audio)
            # ============================= AudioBiLSTM =============================
        return (text_feature, audio_feature)

    def forward(self, x):
        # x = self.bn(x)
        modal_weights = torch.sigmoid(self.modal_attn(x))
        # modal_weights = self.modal_attn(x)
        x = (modal_weights * x)
        output = self.fc_final(x)
        return output


class MyLoss(nn.Module):
    def __init__(self):
        super(MyLoss, self).__init__()

    def forward(self, text_feature, audio_feature, target, model):
        weight = model.fc_final[0].weight
        # bias = model.fc_final[0].bias
        # print(weight, bias)
        pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']])
        pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:])
        # l = nn.CrossEntropyLoss()
        l = nn.SmoothL1Loss()
        target = torch.tensor(target).view_as(pred_text).float()
        return l(pred_text, target) + l(pred_audio, target)
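# --- Illustrative sketch (not part of the original script) -------------------
# MyLoss supervises each modality separately by slicing the weight matrix of
# the bias-free fc_final layer: columns [0, text_hidden_dims) score the text
# feature and the remaining columns score the audio feature. Summing the two
# partial outputs therefore recovers F.linear applied to the raw concatenated
# features:
#
#     weight = model.fc_final[0].weight          # [num_classes, text + audio dims]
#     pred_text = F.linear(text_feature, weight[:, :config['text_hidden_dims']])
#     pred_audio = F.linear(audio_feature, weight[:, config['text_hidden_dims']:])
#     fused = F.linear(torch.cat((text_feature, audio_feature), dim=1), weight)
#     assert torch.allclose(pred_text + pred_audio, fused, atol=1e-5)
#
# (The network's actual forward pass additionally applies the sigmoid modal
# attention before fc_final, so its output differs from this unweighted sum.)
# ------------------------------------------------------------------------------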
def save(model, filename):
    save_filename = '{}.pt'.format(filename)
    torch.save(model, save_filename)
    print('Saved as %s' % save_filename)


def train(model, epoch):
    global max_train_acc, train_acc
    model.train()
    batch_idx = 1
    total_loss = 0
    correct = 0
    pred = np.array([])
    X_train = []
    Y_train = []
    for idx in train_dep_idxs + train_non_idxs:
        X_train.append(fuse_features[idx])
        Y_train.append(fuse_targets[idx])
    for i in range(0, len(X_train), config['batch_size']):
        if i + config['batch_size'] > len(X_train):
            x, y = X_train[i:], Y_train[i:]
        else:
            x, y = X_train[i:(i + config['batch_size'])], Y_train[i:(i + config['batch_size'])]
        # zero the parameter gradients
        optimizer.zero_grad()
        text_feature, audio_feature = model.pretrained_feature(x)
        audio_feature_norm = (audio_feature - audio_feature.mean()) / audio_feature.std()
        text_feature_norm = (text_feature - text_feature.mean()) / text_feature.std()
        # concat_x = torch.cat((text_feature_norm, audio_feature_norm), dim=1)
        concat_x = torch.cat((text_feature, audio_feature), dim=1)
        output = model(concat_x)
        # loss = criterion(output, torch.tensor(y).float())
        loss = criterion(text_feature, audio_feature, y, model)
        # backpropagate to compute gradients
        loss.backward()
        # update the network parameters from the gradients
        optimizer.step()
        batch_idx += 1
        # loss.item() extracts the scalar loss value from the tensor
        pred = np.hstack((pred, output.flatten().detach().numpy()))
        total_loss += loss.item()
    train_mae = mean_absolute_error(Y_train, pred)
    print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n '
          .format(epoch + 1, config['learning_rate'], total_loss, train_mae,
                  np.sqrt(mean_squared_error(Y_train, pred))))
    return train_mae


def evaluate(model, fold, train_mae):
    model.eval()
    batch_idx = 1
    total_loss = 0
    global min_mae, min_rmse, test_dep_idxs, test_non_idxs
    pred = np.array([])
    X_test = []
    Y_test = []
    for idx in list(test_dep_idxs) + list(test_non_idxs):
        X_test.append(fuse_features[idx])
        Y_test.append(fuse_targets[idx])
    for i in range(0, len(X_test), config['batch_size']):
        if i + config['batch_size'] > len(X_test):
            x, y = X_test[i:], Y_test[i:]
        else:
            x, y = X_test[i:(i + config['batch_size'])], Y_test[i:(i + config['batch_size'])]
        text_feature, audio_feature = model.pretrained_feature(x)
        with torch.no_grad():
            audio_feature_norm = (audio_feature - audio_feature.mean()) / audio_feature.std()
            text_feature_norm = (text_feature - text_feature.mean()) / text_feature.std()
            concat_x = torch.cat((text_feature, audio_feature), dim=1)
            # concat_x = torch.cat((text_feature_norm, audio_feature_norm), dim=1)
            output = model(concat_x)
            # loss = criterion(output, torch.tensor(y).float())
            loss = criterion(text_feature, audio_feature, y, model)
            pred = np.hstack((pred, output.flatten().detach().numpy()))
            total_loss += loss.item()
    mae = mean_absolute_error(Y_test, pred)
    rmse = np.sqrt(mean_squared_error(Y_test, pred))
    print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
    print('=' * 89)
    if mae <= min_mae and mae < 8.2 and train_mae < 13:
        min_mae = mae
        min_rmse = rmse
        save(model, os.path.join(prefix, 'Model/Regression/Fuse{}/fuse_{:.2f}'.format(fold + 1, min_mae)))
        print('*' * 64)
        print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse))
        print('*' * 64)
    return total_loss


def evaluate_audio(model):
    model.eval()
    batch_idx = 1
    total_loss = 0
    global min_mae, min_rmse, test_dep_idxs, test_non_idxs
    pred = np.array([])
    X_test = []
    Y_test = []
    for idx in list(test_dep_idxs) + list(test_non_idxs):
        X_test.append(fuse_features[idx][0])
        Y_test.append(fuse_targets[idx])
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    with torch.no_grad():
        if config['cuda']:
            x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(), \
                Variable(torch.from_numpy(Y_test)).cuda()
        else:
            x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \
                Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y.view_as(output))
        total_loss += loss.item()
        pred = output.flatten().detach().numpy()
    mae = mean_absolute_error(Y_test, pred)
    rmse = np.sqrt(mean_squared_error(Y_test, pred))
    print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
    print('=' * 89)


def evaluate_text(model):
    model.eval()
    batch_idx = 1
    total_loss = 0
    global min_mae, min_rmse, test_dep_idxs, test_non_idxs
    pred = np.array([])
    X_test = []
    Y_test = []
    for idx in list(test_dep_idxs) + list(test_non_idxs):
        X_test.append(fuse_features[idx][1])
        Y_test.append(fuse_targets[idx])
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    criterion = nn.SmoothL1Loss()
    with torch.no_grad():
        if config['cuda']:
            x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(), \
                Variable(torch.from_numpy(Y_test)).cuda()
        else:
            x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \
                Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y.view_as(output))
        total_loss += loss.item()
        pred = output.flatten().detach().numpy()
    mae = mean_absolute_error(Y_test, pred)
    rmse = np.sqrt(mean_squared_error(Y_test, pred))
    print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
    print('=' * 89)


for fold in range(3):
    test_dep_idxs_tmp = dep_idxs[fold * 10:(fold + 1) * 10]
    test_non_idxs = non_idxs[fold * 44:(fold + 1) * 44]
    train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp))
    train_non_idxs = list(set(non_idxs) - set(test_non_idxs))
    train_dep_idxs = []
    test_dep_idxs = []

    # depression data augmentation
    for (i, idx) in enumerate(train_dep_idxs_tmp):
        feat = fuse_features[idx]
        audio_perm = itertools.permutations(feat[0], 3)
        text_perm = itertools.permutations(feat[1], 3)
        if i < 14:
            for fuse_perm in zip(audio_perm, text_perm):
                fuse_features.append(fuse_perm)
                fuse_targets = np.hstack((fuse_targets, fuse_targets[idx]))
                train_dep_idxs.append(len(fuse_features) - 1)
        else:
            train_dep_idxs.append(idx)
    test_dep_idxs = test_dep_idxs_tmp

    model = fusion_net(config['text_embed_size'], config['text_hidden_dims'], config['rnn_layers'],
                       config['dropout'], config['num_classes'], config['audio_hidden_dims'],
                       config['audio_embed_size'])
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    # optimizer = optim.Adam(model.parameters())
    # criterion = nn.SmoothL1Loss()
    criterion = MyLoss()

    # load the pretrained single-modality models for this fold
    text_lstm_model = torch.load(os.path.join(prefix, text_model_paths[fold]))
    audio_lstm_model = torch.load(os.path.join(prefix, audio_model_paths[fold]))

    # copy the pretrained audio GRU and fc weights into the fusion net
    model_state_dict = {}
    model_state_dict['lstm_net_audio.weight_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l0']
    model_state_dict['lstm_net_audio.weight_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l0']
    model_state_dict['lstm_net_audio.bias_ih_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l0']
    model_state_dict['lstm_net_audio.bias_hh_l0'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l0']
    model_state_dict['lstm_net_audio.weight_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_ih_l1']
    model_state_dict['lstm_net_audio.weight_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.weight_hh_l1']
    model_state_dict['lstm_net_audio.bias_ih_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_ih_l1']
    model_state_dict['lstm_net_audio.bias_hh_l1'] = audio_lstm_model.state_dict()['lstm_net_audio.bias_hh_l1']
    model_state_dict['fc_audio.1.weight'] = audio_lstm_model.state_dict()['fc_audio.1.weight']
    model_state_dict['fc_audio.1.bias'] = audio_lstm_model.state_dict()['fc_audio.1.bias']
    model_state_dict['fc_audio.4.weight'] = audio_lstm_model.state_dict()['fc_audio.4.weight']
    model_state_dict['fc_audio.4.bias'] = audio_lstm_model.state_dict()['fc_audio.4.bias']

    model.load_state_dict(text_lstm_model.state_dict(), strict=False)
    # model.load_state_dict(audio_lstm_model.state_dict(), strict=False)
    model.load_state_dict(model_state_dict, strict=False)

    for param in model.parameters():
        param.requires_grad = True
    model.fc_final[0].weight.requires_grad = True
    # model.fc_final[0].bias.requires_grad = True
    model.modal_attn.weight.requires_grad = True

    min_mae = 100
    min_rmse = 100
    train_mae = 100
    for ep in range(1, config['epochs']):
        train_mae = train(model, ep)
        tloss = evaluate(model, fold, train_mae)
    # evaluate_audio(audio_lstm_model)
    # evaluate_text(text_lstm_model)


================================================
FILE: DepressionCollected/Regression/text_bilstm_perm.py
================================================
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import pickle
import random
import itertools

prefix = os.path.abspath(os.path.join(os.getcwd(), "../"))
text_features = np.load(os.path.join(prefix, 'Features/TextWhole/whole_samples_reg_avg.npz'))['arr_0']
text_targets = np.load(os.path.join(prefix, 'Features/TextWhole/whole_labels_reg_avg.npz'))['arr_0']
dep_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/dep_idxs.npy'), allow_pickle=True)
non_idxs = np.load(os.path.join(prefix, 'Features/AudioWhole/non_idxs.npy'), allow_pickle=True)

config = {
    'num_classes': 1,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 1024,
    'batch_size': 2,
    'epochs': 110,
    'learning_rate': 1e-5,
    'hidden_dims': 128,
    'bidirectional': True,
    'cuda': False,
}


class TextBiLSTM(nn.Module):
    def __init__(self, config):
        super(TextBiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']
        self.bidirectional = config['bidirectional']

        self.build_model()
        self.init_weight()

    def init_weight(self):
        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param)

    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # two-layer LSTM
        self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=self.bidirectional)
        # self.init_weight()

        # fully connected layers
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes),
            nn.ReLU(),
            # nn.Softmax(dim=1),
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return:            [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # h = lstm_out
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        # x : [len_seq, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(x)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        return self.fc_out(atten_out)


def save(model, filename):
    save_filename = '{}.pt'.format(filename)
    torch.save(model, save_filename)
    print('Saved as %s' % save_filename)


def train(epoch):
    global lr, train_acc
    model.train()
    batch_idx = 1
    total_loss = 0
    correct = 0
    pred = np.array([])
    X_train = text_features[train_dep_idxs + train_non_idxs]
    Y_train = text_targets[train_dep_idxs + train_non_idxs]
    for i in range(0, X_train.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_train.shape[0]:
            x, y = X_train[i:], Y_train[i:]
        else:
            x, y = X_train[i:(i + config['batch_size'])], Y_train[i:(i + config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), \
                Variable(torch.from_numpy(y)).cuda()
        else:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True), \
                Variable(torch.from_numpy(y)).type(torch.FloatTensor)
        # zero the parameter gradients
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y.view_as(output))
        # backpropagate to compute gradients
        loss.backward()
        # update the network parameters from the gradients
        optimizer.step()
        batch_idx += 1
        # loss.item() extracts the scalar loss value from the tensor
        pred = np.hstack((pred, output.flatten().detach().numpy()))
        total_loss += loss.item()
    train_mae = mean_absolute_error(Y_train, pred)
    print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.4f}\t MAE: {:.4f}\t RMSE: {:.4f}\n '
          .format(epoch + 1, config['learning_rate'], total_loss, train_mae,
                  np.sqrt(mean_squared_error(Y_train, pred))))
    return train_mae


def evaluate(fold, model, train_mae):
    model.eval()
    batch_idx = 1
    total_loss = 0
    global min_mae, min_rmse, test_dep_idxs, test_non_idxs
    pred = np.array([])
    X_test = text_features[list(test_dep_idxs) + list(test_non_idxs)]
    Y_test = text_targets[list(test_dep_idxs) + list(test_non_idxs)]
    with torch.no_grad():
        if config['cuda']:
            x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True).cuda(), \
                Variable(torch.from_numpy(Y_test)).cuda()
        else:
            x, y = Variable(torch.from_numpy(X_test).type(torch.FloatTensor), requires_grad=True), \
                Variable(torch.from_numpy(Y_test)).type(torch.FloatTensor)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y.view_as(output))
        total_loss += loss.item()
        pred = output.flatten().detach().numpy()
    mae = mean_absolute_error(Y_test, pred)
    rmse = np.sqrt(mean_squared_error(Y_test, pred))
    print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
    print('=' * 89)
    if mae <= min_mae and mae < 8.5 and train_mae < 13:
        min_mae = mae
        min_rmse = rmse
        mode = 'bi' if config['bidirectional'] else 'norm'
        mode = 'gru'
        save(model, os.path.join(prefix, 'Model/Regression/Text{}/BiLSTM_{}_{:.2f}'.format(
            fold + 1, config['hidden_dims'], min_mae)))
        print('*' * 64)
        print('model saved: mae: {}\t rmse: {}'.format(min_mae, min_rmse))
        print('*' * 64)
    return total_loss


for fold in range(3):
    test_dep_idxs_tmp = dep_idxs[fold * 10:(fold + 1) * 10]
    test_non_idxs = non_idxs[fold * 44:(fold + 1) * 44]
    train_dep_idxs_tmp = list(set(dep_idxs) - set(test_dep_idxs_tmp))
    train_non_idxs = list(set(non_idxs) - set(test_non_idxs))

    # training data augmentation: add every permutation of the topic features
    # of the first 14 depressed training subjects as extra samples
    train_dep_idxs = []
    for (i, idx) in enumerate(train_dep_idxs_tmp):
        feat = text_features[idx]
        if i < 14:
            for perm in itertools.permutations(feat, feat.shape[0]):
                text_features = np.vstack((text_features, np.expand_dims(list(perm), 0)))
                text_targets = np.hstack((text_targets, text_targets[idx]))
                train_dep_idxs.append(len(text_features) - 1)
        else:
            train_dep_idxs.append(idx)

    # test data augmentation
    # test_dep_idxs = []
    # for idx in test_dep_idxs_tmp:
    #     feat = text_features[idx]
    #     for perm in itertools.permutations(feat, feat.shape[0]):
    #         text_features = np.vstack((text_features, np.expand_dims(list(perm), 0)))
    #         text_targets = np.hstack((text_targets, text_targets[idx]))
    #         test_dep_idxs.append(len(text_features) - 1)
    test_dep_idxs = test_dep_idxs_tmp

    model = TextBiLSTM(config)
    if config['cuda']:
        model = model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
    criterion = nn.SmoothL1Loss()
    # criterion = FocalLoss(class_num=2)
    min_mae = 100
    min_rmse = 100
    train_mae = 100
    for ep in range(1, config['epochs']):
        train_mae = train(ep)
        tloss = evaluate(fold, model, train_mae)

# ============== prep ==============
# X_test = np.squeeze(np.load(os.path.join(prefix, 'Features/Audio/val_samples_reg_avid256.npz'))['arr_0'], axis=2)
# Y_test = np.load(os.path.join(prefix, 'Features/Audio/val_labels_reg_avid256.npz'))['arr_0']
# ============== prep ==============
# ============== SVM ==============
# from sklearn.svm import SVR
# from sklearn.model_selection import KFold
# X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = SVR(kernel='linear', gamma='auto')
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
#     # break
# print(np.mean(maes), np.mean(rmses))
# ============== SVM ==============

# ============== DT ==============
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import KFold
# X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = DecisionTreeRegressor(max_depth=100, random_state=0, criterion="mse")
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
# print(np.mean(maes), np.mean(rmses))
# ============== DT ==============

# ============== RF ==============
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import KFold
# X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = RandomForestRegressor(max_depth=100, random_state=0, criterion="mse")
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
# print(np.mean(maes), np.mean(rmses))
# ============== RF ==============

# ============== ada ==============
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.model_selection import KFold
# X = text_features[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# Y = text_targets[train_dep_idxs+train_non_idxs+test_dep_idxs+test_non_idxs]
# kf = KFold(n_splits=3)
# regr = AdaBoostRegressor(n_estimators=50)
# maes, rmses = [], []
# for train_index, test_index in kf.split(X):
#     # X_train, X_test = X[train_index], X[test_index]
#     # Y_train, Y_test = Y[train_index], Y[test_index]
#     X_train, Y_train = X[train_index], Y[train_index]
#     regr.fit([f.flatten() for f in X_train], Y_train)
#     pred = regr.predict([f.flatten() for f in X_test])
#     mae = mean_absolute_error(Y_test, pred)
#     rmse = np.sqrt(mean_squared_error(Y_test, pred))
#     maes.append(mae)
#     rmses.append(rmse)
#     print('MAE: {:.4f}\t RMSE: {:.4f}\n'.format(mae, rmse))
#     print('='*89)
# print(np.mean(maes), np.mean(rmses))
# ============== ada ==============


================================================
FILE: README.md
================================================
# ICASSP2022-Depression

Automatic Depression Detection: a GRU/BiLSTM-based Model and an Emotional Audio-Textual Corpus

https://arxiv.org/pdf/2202.08210.pdf

https://ieeexplore.ieee.org/abstract/document/9746569/

## Code

- Regression
  - audio_bilstm_perm.py: train the audio network
  - text_bilstm_perm.py: train the text network
  - fuse_net.py: train the multi-modal network
- Classification
  - audio_features_whole.py: extract audio features
  - text_features_whole.py: extract text features
  - audio_gru_whole.py: train the audio network
  - text_bilstm_whole.py: train the text network
  - fuse_net_whole.py: train the fuse network

## Dataset: EATD-Corpus

The EATD-Corpus is a dataset consisting of audio and text files from 162 volunteers who received counseling.

### How to download

The EATD-Corpus can be downloaded at https://1drv.ms/u/s!AsGVGqImbOwYhHUHcodFC3xmKZKK?e=mCT5oN. Password: Ymj26Uv5

### How to use

The training set contains data from 83 volunteers (19 depressed and 64 non-depressed). The validation set contains data from 79 volunteers (11 depressed and 68 non-depressed). Each folder contains the depression data of one volunteer:

- {positive/negative/neutral}.wav: raw audio in wav format
- {positive/negative/neutral}_out.wav: preprocessed audio; preprocessing includes denoising and removing silent segments
- {positive/negative/neutral}.txt: transcript of the audio
- label.txt: raw SDS score
- new_label.txt: standard SDS score (the raw SDS score multiplied by 1.25)
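For reference, here is a minimal loading sketch for one volunteer folder (the folder name `t_1` is a placeholder, and the use of `librosa` simply mirrors the training scripts; adapt both to your local copy):

```python
import os
import librosa

folder = 'EATD-Corpus/t_1'  # hypothetical path to one volunteer's folder
topics = ['positive', 'negative', 'neutral']

# preprocessed audio and the matching transcripts, one pair per topic
audio = {t: librosa.load(os.path.join(folder, t + '_out.wav'), sr=None) for t in topics}
texts = {t: open(os.path.join(folder, t + '.txt'), encoding='utf-8').read().strip() for t in topics}

# labels: new_label.txt holds the raw SDS score scaled by 1.25 (e.g. raw 40 -> standard 50)
raw_sds = float(open(os.path.join(folder, 'label.txt')).read().strip())
std_sds = float(open(os.path.join(folder, 'new_label.txt')).read().strip())
assert abs(std_sds - raw_sds * 1.25) < 1e-6
```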