Repository: diogenes0319/FedMD_clean Branch: master Commit: ab7a07b02c97 Files: 14 Total size: 46.1 MB Directory structure: gitextract_qsrhd5uv/ ├── CIFAR_Balanced.py ├── CIFAR_Imbalanced.py ├── FEMNIST_Balanced.py ├── FEMNIST_Imbalanced.py ├── FedMD.py ├── Neural_Networks.py ├── README.md ├── conf/ │ ├── CIFAR_balance_conf.json │ ├── CIFAR_imbalance_conf.json │ ├── EMNIST_balance_conf.json │ └── EMNIST_imbalance_conf.json ├── data_utils.py ├── dataset/ │ └── emnist-letters.mat └── utility.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: CIFAR_Balanced.py ================================================ import os import errno import argparse import sys import pickle import numpy as np import pandas as pd from tensorflow.keras.models import load_model from data_utils import load_CIFAR_data, generate_partial_data, generate_bal_private_data from FedMD import FedMD from Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model def parseArg(): parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \ Participants are training collaboratively. ') parser.add_argument('-conf', metavar='conf_file', nargs=1, help='the config file for FedMD.' ) conf_file = os.path.abspath("conf/CIFAR_balance_conf.json") if len(sys.argv) > 1: args = parser.parse_args(sys.argv[1:]) if args.conf: conf_file = args.conf[0] return conf_file CANDIDATE_MODELS = {"2_layer_CNN": cnn_2layer_fc_model, "3_layer_CNN": cnn_3layer_fc_model} if __name__ == "__main__": conf_file = parseArg() with open(conf_file, "r") as f: conf_dict = eval(f.read()) #n_classes = conf_dict["n_classes"] model_config = conf_dict["models"] pre_train_params = conf_dict["pre_train_params"] model_saved_dir = conf_dict["model_saved_dir"] model_saved_names = conf_dict["model_saved_names"] is_early_stopping = conf_dict["early_stopping"] public_classes = conf_dict["public_classes"] private_classes = conf_dict["private_classes"] n_classes = len(public_classes) + len(private_classes) emnist_data_dir = conf_dict["EMNIST_dir"] N_parties = conf_dict["N_parties"] N_samples_per_class = conf_dict["N_samples_per_class"] N_rounds = conf_dict["N_rounds"] N_alignment = conf_dict["N_alignment"] N_private_training_round = conf_dict["N_private_training_round"] private_training_batchsize = conf_dict["private_training_batchsize"] N_logits_matching_round = conf_dict["N_logits_matching_round"] logits_matching_batchsize = conf_dict["logits_matching_batchsize"] result_save_dir = conf_dict["result_save_dir"] del conf_dict, conf_file X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10 \ = load_CIFAR_data(data_type="CIFAR10", standarized = True, verbose = True) public_dataset = {"X": X_train_CIFAR10, "y": y_train_CIFAR10} X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100 \ = load_CIFAR_data(data_type="CIFAR100", standarized = True, verbose = True) # only use those CIFAR100 data whose y_labels belong to private_classes X_train_CIFAR100, y_train_CIFAR100 \ = generate_partial_data(X = X_train_CIFAR100, y= y_train_CIFAR100, class_in_use = private_classes, verbose = True) X_test_CIFAR100, y_test_CIFAR100 \ = generate_partial_data(X = X_test_CIFAR100, y= y_test_CIFAR100, class_in_use = private_classes, verbose = True) # relabel the selected CIFAR100 data for future convenience for index, cls_ in enumerate(private_classes): y_train_CIFAR100[y_train_CIFAR100 == cls_] = index + len(public_classes) y_test_CIFAR100[y_test_CIFAR100 == cls_] = index + len(public_classes) del index, cls_ print(pd.Series(y_train_CIFAR100).value_counts()) mod_private_classes = np.arange(len(private_classes)) + len(public_classes) print("="*60) #generate private data private_data, total_private_data\ =generate_bal_private_data(X_train_CIFAR100, y_train_CIFAR100, N_parties = N_parties, classes_in_use = mod_private_classes, N_samples_per_class = N_samples_per_class, data_overlap = False) print("="*60) X_tmp, y_tmp = generate_partial_data(X = X_test_CIFAR100, y= y_test_CIFAR100, class_in_use = mod_private_classes, verbose = True) private_test_data = {"X": X_tmp, "y": y_tmp} del X_tmp, y_tmp parties = [] if model_saved_dir is None: for i, item in enumerate(model_config): model_name = item["model_type"] model_params = item["params"] tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes, input_shape=(32,32,3), **model_params) print("model {0} : {1}".format(i, model_saved_names[i])) print(tmp.summary()) parties.append(tmp) del model_name, model_params, tmp #END FOR LOOP pre_train_result = train_models(parties, X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10, save_dir = model_saved_dir, save_names = model_saved_names, early_stopping = is_early_stopping, **pre_train_params ) else: dpath = os.path.abspath(model_saved_dir) model_names = os.listdir(dpath) for name in model_names: tmp = None tmp = load_model(os.path.join(dpath ,name)) parties.append(tmp) del X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10, \ X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100, fedmd = FedMD(parties, public_dataset = public_dataset, private_data = private_data, total_private_data = total_private_data, private_test_data = private_test_data, N_rounds = N_rounds, N_alignment = N_alignment, N_logits_matching_round = N_logits_matching_round, logits_matching_batchsize = logits_matching_batchsize, N_private_training_round = N_private_training_round, private_training_batchsize = private_training_batchsize) initialization_result = fedmd.init_result pooled_train_result = fedmd.pooled_train_result collaboration_performance = fedmd.collaborative_training() if result_save_dir is not None: save_dir_path = os.path.abspath(result_save_dir) #make dir try: os.makedirs(save_dir_path) except OSError as e: if e.errno != errno.EEXIST: raise with open(os.path.join(save_dir_path, 'pre_train_result.pkl'), 'wb') as f: pickle.dump(pre_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'init_result.pkl'), 'wb') as f: pickle.dump(initialization_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'pooled_train_result.pkl'), 'wb') as f: pickle.dump(pooled_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'col_performance.pkl'), 'wb') as f: pickle.dump(collaboration_performance, f, protocol=pickle.HIGHEST_PROTOCOL) ================================================ FILE: CIFAR_Imbalanced.py ================================================ import os import errno import argparse import sys import pickle import numpy as np import pandas as pd from tensorflow.keras.models import load_model from data_utils import load_CIFAR_data, load_CIFAR_from_local, generate_partial_data, generate_imbal_CIFAR_private_data from FedMD import FedMD from Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model def parseArg(): parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \ Participants are training collaboratively. ') parser.add_argument('-conf', metavar='conf_file', nargs=1, help='the config file for FedMD.' ) conf_file = os.path.abspath("conf/CIFAR_imbalance_conf.json") if len(sys.argv) > 1: args = parser.parse_args(sys.argv[1:]) if args.conf: conf_file = args.conf[0] return conf_file CANDIDATE_MODELS = {"2_layer_CNN": cnn_2layer_fc_model, "3_layer_CNN": cnn_3layer_fc_model} if __name__ == "__main__": conf_file = parseArg() with open(conf_file, "r") as f: conf_dict = eval(f.read()) #n_classes = conf_dict["n_classes"] model_config = conf_dict["models"] pre_train_params = conf_dict["pre_train_params"] model_saved_dir = conf_dict["model_saved_dir"] model_saved_names = conf_dict["model_saved_names"] is_early_stopping = conf_dict["early_stopping"] public_classes = conf_dict["public_classes"] public_classes.sort() private_classes = conf_dict["private_classes"] private_classes.sort() n_classes = len(public_classes) + len(private_classes) emnist_data_dir = conf_dict["EMNIST_dir"] N_parties = conf_dict["N_parties"] N_samples_per_class = conf_dict["N_samples_per_class"] N_rounds = conf_dict["N_rounds"] N_alignment = conf_dict["N_alignment"] N_private_training_round = conf_dict["N_private_training_round"] private_training_batchsize = conf_dict["private_training_batchsize"] N_logits_matching_round = conf_dict["N_logits_matching_round"] logits_matching_batchsize = conf_dict["logits_matching_batchsize"] result_save_dir = conf_dict["result_save_dir"] del conf_dict, conf_file X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10 \ = load_CIFAR_data(data_type="CIFAR10", standarized = True, verbose = True) public_dataset = {"X": X_train_CIFAR10, "y": y_train_CIFAR10} X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100 \ = load_CIFAR_data(data_type="CIFAR100", standarized = True, verbose = True) a_, y_train_super, b_, y_test_super \ = load_CIFAR_data(data_type="CIFAR100", label_mode="coarse", standarized = True, verbose = True) del a_, b_ # X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10 \ # = load_CIFAR_from_local(local_dir="./dataset/CIFAR10/", # data_type="CIFAR10", # standarized = True, verbose = True) # public_dataset = {"X": X_train_CIFAR10, "y": y_train_CIFAR10} # X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100 \ # = load_CIFAR_from_local(local_dir="./dataset/CIFAR100/", # data_type="CIFAR100", with_coarse_label = False, # standarized = True, verbose = True) # a_, y_train_super, b_, y_test_super \ # = load_CIFAR_from_local(local_dir="./dataset/CIFAR100/", # data_type="CIFAR100", with_coarse_label = True, # standarized = True, verbose = True) # del a_, b_ #Find the relations between superclasses and subclasses relations = [set() for i in range(np.max(y_train_super)+1)] for i, y_fine in enumerate(y_train_CIFAR100): relations[y_train_super[i]].add(y_fine) for i in range(len(relations)): relations[i]=list(relations[i]) del i, y_fine #print(relations) #print(np.array(relations).shape) fine_classes_in_use = [[relations[j][i%5] for j in private_classes] for i in range(N_parties)] print(fine_classes_in_use) #Generate test set X_tmp, y_tmp = generate_partial_data(X_test_CIFAR100, y_test_super, class_in_use = private_classes, verbose = True) #print(pd.Series(y_tmp).value_counts()) # relabel the selected CIFAR100 data for future convenience for index in range(len(private_classes)-1, -1, -1): cls_ = private_classes[index] y_tmp[y_tmp == cls_] = index + len(public_classes) #print(pd.Series(y_tmp).value_counts()) private_test_data = {"X": X_tmp, "y": y_tmp} del index, cls_, X_tmp, y_tmp print("="*60) #generate private data private_data, total_private_data \ = generate_imbal_CIFAR_private_data(X_train_CIFAR100, y_train_CIFAR100, y_train_super, N_parties = N_parties, classes_per_party = fine_classes_in_use, samples_per_class = N_samples_per_class) for index in range(len(private_classes)-1, -1, -1): cls_ = private_classes[index] total_private_data["y"][total_private_data["y"] == cls_] = index + len(public_classes) for i in range(N_parties): private_data[i]["y"][private_data[i]["y"] == cls_] = index + len(public_classes) del index, cls_ # for i in range(N_parties): # print("iter:", i) # print(pd.Series(private_data[i]["y"]).value_counts()) # print(pd.Series(total_private_data["y"]).value_counts()) mod_private_classes = np.arange(len(private_classes)) + len(public_classes) print("=" * 60) parties = [] if model_saved_dir is None: for i, item in enumerate(model_config): model_name = item["model_type"] model_params = item["params"] tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes, input_shape=(32,32,3), **model_params) print("model {0} : {1}".format(i, model_saved_names[i])) print(tmp.summary()) parties.append(tmp) del model_name, model_params, tmp #END FOR LOOP pre_train_result = train_models(parties, X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10, save_dir = model_saved_dir, save_names = model_saved_names, early_stopping = is_early_stopping, **pre_train_params ) else: dpath = os.path.abspath(model_saved_dir) model_names = os.listdir(dpath) for name in model_names: tmp = None tmp = load_model(os.path.join(dpath ,name)) parties.append(tmp) del X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10, \ X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100, y_train_super, y_test_super fedmd = FedMD(parties, public_dataset = public_dataset, private_data = private_data, total_private_data = total_private_data, private_test_data = private_test_data, N_rounds = N_rounds, N_alignment = N_alignment, N_logits_matching_round = N_logits_matching_round, logits_matching_batchsize = logits_matching_batchsize, N_private_training_round = N_private_training_round, private_training_batchsize = private_training_batchsize) initialization_result = fedmd.init_result pooled_train_result = fedmd.pooled_train_result collaboration_performance = fedmd.collaborative_training() if result_save_dir is not None: save_dir_path = os.path.abspath(result_save_dir) #make dir try: os.makedirs(save_dir_path) except OSError as e: if e.errno != errno.EEXIST: raise with open(os.path.join(save_dir_path, 'pre_train_result.pkl'), 'wb') as f: pickle.dump(pre_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'init_result.pkl'), 'wb') as f: pickle.dump(initialization_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'pooled_train_result.pkl'), 'wb') as f: pickle.dump(pooled_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'col_performance.pkl'), 'wb') as f: pickle.dump(collaboration_performance, f, protocol=pickle.HIGHEST_PROTOCOL) ================================================ FILE: FEMNIST_Balanced.py ================================================ import os import errno import argparse import sys import pickle import numpy as np from tensorflow.keras.models import load_model from data_utils import load_MNIST_data, load_EMNIST_data, generate_bal_private_data,\ generate_partial_data from FedMD import FedMD from Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model def parseArg(): parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \ Participants are training collaboratively. ') parser.add_argument('-conf', metavar='conf_file', nargs=1, help='the config file for FedMD.' ) conf_file = os.path.abspath("conf/EMNIST_balance_conf.json") if len(sys.argv) > 1: args = parser.parse_args(sys.argv[1:]) if args.conf: conf_file = args.conf[0] return conf_file CANDIDATE_MODELS = {"2_layer_CNN": cnn_2layer_fc_model, "3_layer_CNN": cnn_3layer_fc_model} if __name__ == "__main__": conf_file = parseArg() with open(conf_file, "r") as f: conf_dict = eval(f.read()) #n_classes = conf_dict["n_classes"] model_config = conf_dict["models"] pre_train_params = conf_dict["pre_train_params"] model_saved_dir = conf_dict["model_saved_dir"] model_saved_names = conf_dict["model_saved_names"] is_early_stopping = conf_dict["early_stopping"] public_classes = conf_dict["public_classes"] private_classes = conf_dict["private_classes"] n_classes = len(public_classes) + len(private_classes) emnist_data_dir = conf_dict["EMNIST_dir"] N_parties = conf_dict["N_parties"] N_samples_per_class = conf_dict["N_samples_per_class"] N_rounds = conf_dict["N_rounds"] N_alignment = conf_dict["N_alignment"] N_private_training_round = conf_dict["N_private_training_round"] private_training_batchsize = conf_dict["private_training_batchsize"] N_logits_matching_round = conf_dict["N_logits_matching_round"] logits_matching_batchsize = conf_dict["logits_matching_batchsize"] result_save_dir = conf_dict["result_save_dir"] del conf_dict, conf_file X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST \ = load_MNIST_data(standarized = True, verbose = True) public_dataset = {"X": X_train_MNIST, "y": y_train_MNIST} X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST, writer_ids_train, writer_ids_test \ = load_EMNIST_data(emnist_data_dir, standarized = True, verbose = True) y_train_EMNIST += len(public_classes) y_test_EMNIST += len(public_classes) #generate private data private_data, total_private_data \ = generate_bal_private_data(X_train_EMNIST, y_train_EMNIST, N_parties = N_parties, classes_in_use = private_classes, N_samples_per_class = N_samples_per_class, data_overlap = False) X_tmp, y_tmp = generate_partial_data(X = X_test_EMNIST, y= y_test_EMNIST, class_in_use = private_classes, verbose = True) private_test_data = {"X": X_tmp, "y": y_tmp} del X_tmp, y_tmp parties = [] if model_saved_dir is None: for i, item in enumerate(model_config): model_name = item["model_type"] model_params = item["params"] tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes, input_shape=(28,28), **model_params) print("model {0} : {1}".format(i, model_saved_names[i])) print(tmp.summary()) parties.append(tmp) del model_name, model_params, tmp #END FOR LOOP pre_train_result = train_models(parties, X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST, save_dir = model_saved_dir, save_names = model_saved_names, early_stopping = is_early_stopping, **pre_train_params ) else: dpath = os.path.abspath(model_saved_dir) model_names = os.listdir(dpath) for name in model_names: tmp = None tmp = load_model(os.path.join(dpath ,name)) parties.append(tmp) del X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST, \ X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST, writer_ids_train, writer_ids_test fedmd = FedMD(parties, public_dataset = public_dataset, private_data = private_data, total_private_data = total_private_data, private_test_data = private_test_data, N_rounds = N_rounds, N_alignment = N_alignment, N_logits_matching_round = N_logits_matching_round, logits_matching_batchsize = logits_matching_batchsize, N_private_training_round = N_private_training_round, private_training_batchsize = private_training_batchsize) initialization_result = fedmd.init_result pooled_train_result = fedmd.pooled_train_result collaboration_performance = fedmd.collaborative_training() if result_save_dir is not None: save_dir_path = os.path.abspath(result_save_dir) #make dir try: os.makedirs(save_dir_path) except OSError as e: if e.errno != errno.EEXIST: raise with open(os.path.join(save_dir_path, 'pre_train_result.pkl'), 'wb') as f: pickle.dump(pre_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'init_result.pkl'), 'wb') as f: pickle.dump(initialization_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'pooled_train_result.pkl'), 'wb') as f: pickle.dump(pooled_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'col_performance.pkl'), 'wb') as f: pickle.dump(collaboration_performance, f, protocol=pickle.HIGHEST_PROTOCOL) ================================================ FILE: FEMNIST_Imbalanced.py ================================================ import os import errno import argparse import sys import pickle import numpy as np from tensorflow.keras.models import load_model from data_utils import load_MNIST_data, load_EMNIST_data, generate_EMNIST_writer_based_data, generate_partial_data from FedMD import FedMD from Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model def parseArg(): parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \ Participants are training collaboratively. ') parser.add_argument('-conf', metavar='conf_file', nargs=1, help='the config file for FedMD.' ) conf_file = os.path.abspath("conf/EMNIST_imbalance_conf.json") if len(sys.argv) > 1: args = parser.parse_args(sys.argv[1:]) if args.conf: conf_file = args.conf[0] return conf_file CANDIDATE_MODELS = {"2_layer_CNN": cnn_2layer_fc_model, "3_layer_CNN": cnn_3layer_fc_model} if __name__ == "__main__": conf_file = parseArg() with open(conf_file, "r") as f: conf_dict = eval(f.read()) #n_classes = conf_dict["n_classes"] model_config = conf_dict["models"] pre_train_params = conf_dict["pre_train_params"] model_saved_dir = conf_dict["model_saved_dir"] model_saved_names = conf_dict["model_saved_names"] is_early_stopping = conf_dict["early_stopping"] public_classes = conf_dict["public_classes"] private_classes = conf_dict["private_classes"] n_classes = len(public_classes) + len(private_classes) emnist_data_dir = conf_dict["EMNIST_dir"] N_parties = conf_dict["N_parties"] N_samples_per_class = conf_dict["N_samples_per_class"] N_rounds = conf_dict["N_rounds"] N_alignment = conf_dict["N_alignment"] N_private_training_round = conf_dict["N_private_training_round"] private_training_batchsize = conf_dict["private_training_batchsize"] N_logits_matching_round = conf_dict["N_logits_matching_round"] logits_matching_batchsize = conf_dict["logits_matching_batchsize"] result_save_dir = conf_dict["result_save_dir"] del conf_dict, conf_file X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST \ = load_MNIST_data(standarized = True, verbose = True) public_dataset = {"X": X_train_MNIST, "y": y_train_MNIST} X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST, \ writer_ids_train_EMNIST, writer_ids_test_EMNIST \ = load_EMNIST_data(emnist_data_dir, standarized = True, verbose = True) y_train_EMNIST += len(public_classes) y_test_EMNIST += len(public_classes) #generate private data private_data, total_private_data\ =generate_EMNIST_writer_based_data(X_train_EMNIST, y_train_EMNIST, writer_ids_train_EMNIST, N_parties = N_parties, classes_in_use = private_classes, N_priv_data_min = N_samples_per_class * len(private_classes) ) X_tmp, y_tmp = generate_partial_data(X = X_test_EMNIST, y= y_test_EMNIST, class_in_use = private_classes, verbose = True) private_test_data = {"X": X_tmp, "y": y_tmp} del X_tmp, y_tmp parties = [] if model_saved_dir is None: for i, item in enumerate(model_config): model_name = item["model_type"] model_params = item["params"] tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes, input_shape=(28,28), **model_params) print("model {0} : {1}".format(i, model_saved_names[i])) print(tmp.summary()) parties.append(tmp) del model_name, model_params, tmp #END FOR LOOP pre_train_result = train_models(parties, X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST, save_dir = model_saved_dir, save_names = model_saved_names, early_stopping = is_early_stopping, **pre_train_params ) else: dpath = os.path.abspath(model_saved_dir) model_names = os.listdir(dpath) for name in model_names: tmp = None tmp = load_model(os.path.join(dpath ,name)) parties.append(tmp) del X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST, \ X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST, writer_ids_train_EMNIST, writer_ids_test_EMNIST fedmd = FedMD(parties, public_dataset = public_dataset, private_data = private_data, total_private_data = total_private_data, private_test_data = private_test_data, N_rounds = N_rounds, N_alignment = N_alignment, N_logits_matching_round = N_logits_matching_round, logits_matching_batchsize = logits_matching_batchsize, N_private_training_round = N_private_training_round, private_training_batchsize = private_training_batchsize) initialization_result = fedmd.init_result pooled_train_result = fedmd.pooled_train_result collaboration_performance = fedmd.collaborative_training() if result_save_dir is not None: save_dir_path = os.path.abspath(result_save_dir) #make dir try: os.makedirs(save_dir_path) except OSError as e: if e.errno != errno.EEXIST: raise with open(os.path.join(save_dir_path, 'pre_train_result.pkl'), 'wb') as f: pickle.dump(pre_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'init_result.pkl'), 'wb') as f: pickle.dump(initialization_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'pooled_train_result.pkl'), 'wb') as f: pickle.dump(pooled_train_result, f, protocol=pickle.HIGHEST_PROTOCOL) with open(os.path.join(save_dir_path, 'col_performance.pkl'), 'wb') as f: pickle.dump(collaboration_performance, f, protocol=pickle.HIGHEST_PROTOCOL) ================================================ FILE: FedMD.py ================================================ import numpy as np from tensorflow.keras.models import clone_model, load_model from tensorflow.keras.callbacks import EarlyStopping import tensorflow as tf from data_utils import generate_alignment_data from Neural_Networks import remove_last_layer class FedMD(): def __init__(self, parties, public_dataset, private_data, total_private_data, private_test_data, N_alignment, N_rounds, N_logits_matching_round, logits_matching_batchsize, N_private_training_round, private_training_batchsize): self.N_parties = len(parties) self.public_dataset = public_dataset self.private_data = private_data self.private_test_data = private_test_data self.N_alignment = N_alignment self.N_rounds = N_rounds self.N_logits_matching_round = N_logits_matching_round self.logits_matching_batchsize = logits_matching_batchsize self.N_private_training_round = N_private_training_round self.private_training_batchsize = private_training_batchsize self.collaborative_parties = [] self.init_result = [] print("start model initialization: ") for i in range(self.N_parties): print("model ", i) model_A_twin = None model_A_twin = clone_model(parties[i]) model_A_twin.set_weights(parties[i].get_weights()) model_A_twin.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss = "sparse_categorical_crossentropy", metrics = ["accuracy"]) print("start full stack training ... ") model_A_twin.fit(private_data[i]["X"], private_data[i]["y"], batch_size = 32, epochs = 25, shuffle=True, verbose = 0, validation_data = [private_test_data["X"], private_test_data["y"]], callbacks=[EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=10)] ) print("full stack training done") model_A = remove_last_layer(model_A_twin, loss="mean_absolute_error") self.collaborative_parties.append({"model_logits": model_A, "model_classifier": model_A_twin, "model_weights": model_A_twin.get_weights()}) self.init_result.append({"val_acc": model_A_twin.history.history['val_accuracy'], "train_acc": model_A_twin.history.history['accuracy'], "val_loss": model_A_twin.history.history['val_loss'], "train_loss": model_A_twin.history.history['loss'], }) print() del model_A, model_A_twin #END FOR LOOP print("calculate the theoretical upper bounds for participants: ") self.upper_bounds = [] self.pooled_train_result = [] for model in parties: model_ub = clone_model(model) model_ub.set_weights(model.get_weights()) model_ub.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss = "sparse_categorical_crossentropy", metrics = ["accuracy"]) model_ub.fit(total_private_data["X"], total_private_data["y"], batch_size = 32, epochs = 50, shuffle=True, verbose = 0, validation_data = [private_test_data["X"], private_test_data["y"]], callbacks=[EarlyStopping(monitor="val_accuracy", min_delta=0.001, patience=10)]) self.upper_bounds.append(model_ub.history.history["val_accuracy"][-1]) self.pooled_train_result.append({"val_acc": model_ub.history.history["val_accuracy"], "acc": model_ub.history.history["accuracy"]}) del model_ub print("the upper bounds are:", self.upper_bounds) def collaborative_training(self): # start collaborating training collaboration_performance = {i: [] for i in range(self.N_parties)} r = 0 while True: # At beginning of each round, generate new alignment dataset alignment_data = generate_alignment_data(self.public_dataset["X"], self.public_dataset["y"], self.N_alignment) print("round ", r) print("update logits ... ") # update logits logits = 0 for d in self.collaborative_parties: d["model_logits"].set_weights(d["model_weights"]) logits += d["model_logits"].predict(alignment_data["X"], verbose = 0) logits /= self.N_parties # test performance print("test performance ... ") for index, d in enumerate(self.collaborative_parties): y_pred = d["model_classifier"].predict(self.private_test_data["X"], verbose = 0).argmax(axis = 1) collaboration_performance[index].append(np.mean(self.private_test_data["y"] == y_pred)) print(collaboration_performance[index][-1]) del y_pred r+= 1 if r > self.N_rounds: break print("updates models ...") for index, d in enumerate(self.collaborative_parties): print("model {0} starting alignment with public logits... ".format(index)) weights_to_use = None weights_to_use = d["model_weights"] d["model_logits"].set_weights(weights_to_use) d["model_logits"].fit(alignment_data["X"], logits, batch_size = self.logits_matching_batchsize, epochs = self.N_logits_matching_round, shuffle=True, verbose = 0) d["model_weights"] = d["model_logits"].get_weights() print("model {0} done alignment".format(index)) print("model {0} starting training with private data... ".format(index)) weights_to_use = None weights_to_use = d["model_weights"] d["model_classifier"].set_weights(weights_to_use) d["model_classifier"].fit(self.private_data[index]["X"], self.private_data[index]["y"], batch_size = self.private_training_batchsize, epochs = self.N_private_training_round, shuffle=True, verbose = 0) d["model_weights"] = d["model_classifier"].get_weights() print("model {0} done private training. \n".format(index)) #END FOR LOOP #END WHILE LOOP return collaboration_performance ================================================ FILE: Neural_Networks.py ================================================ from tensorflow.keras.models import Model, Sequential, clone_model, load_model from tensorflow.keras.layers import Input, Dense, add, concatenate, Conv2D,Dropout,\ BatchNormalization, Flatten, MaxPooling2D, AveragePooling2D, Activation, Dropout, Reshape from tensorflow.keras.callbacks import EarlyStopping import tensorflow as tf def cnn_3layer_fc_model(n_classes,n1 = 128, n2=192, n3=256, dropout_rate = 0.2,input_shape = (28,28)): model_A, x = None, None x = Input(input_shape) if len(input_shape)==2: y = Reshape((input_shape[0], input_shape[1], 1))(x) else: y = Reshape(input_shape)(x) y = Conv2D(filters = n1, kernel_size = (3,3), strides = 1, padding = "same", activation = None)(y) y = BatchNormalization()(y) y = Activation("relu")(y) y = Dropout(dropout_rate)(y) y = AveragePooling2D(pool_size = (2,2), strides = 1, padding = "same")(y) y = Conv2D(filters = n2, kernel_size = (2,2), strides = 2, padding = "valid", activation = None)(y) y = BatchNormalization()(y) y = Activation("relu")(y) y = Dropout(dropout_rate)(y) y = AveragePooling2D(pool_size = (2,2), strides = 2, padding = "valid")(y) y = Conv2D(filters = n3, kernel_size = (3,3), strides = 2, padding = "valid", activation = None)(y) y = BatchNormalization()(y) y = Activation("relu")(y) y = Dropout(dropout_rate)(y) #y = AveragePooling2D(pool_size = (2,2), strides = 2, padding = "valid")(y) y = Flatten()(y) y = Dense(units = n_classes, activation = None, use_bias = False, kernel_regularizer=tf.keras.regularizers.l2(1e-3))(y) y = Activation("softmax")(y) model_A = Model(inputs = x, outputs = y) model_A.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss = "sparse_categorical_crossentropy", metrics = ["accuracy"]) return model_A def cnn_2layer_fc_model(n_classes,n1 = 128, n2=256, dropout_rate = 0.2,input_shape = (28,28)): model_A, x = None, None x = Input(input_shape) if len(input_shape)==2: y = Reshape((input_shape[0], input_shape[1], 1))(x) else: y = Reshape(input_shape)(x) y = Conv2D(filters = n1, kernel_size = (3,3), strides = 1, padding = "same", activation = None)(y) y = BatchNormalization()(y) y = Activation("relu")(y) y = Dropout(dropout_rate)(y) y = AveragePooling2D(pool_size = (2,2), strides = 1, padding = "same")(y) y = Conv2D(filters = n2, kernel_size = (3,3), strides = 2, padding = "valid", activation = None)(y) y = BatchNormalization()(y) y = Activation("relu")(y) y = Dropout(dropout_rate)(y) #y = AveragePooling2D(pool_size = (2,2), strides = 2, padding = "valid")(y) y = Flatten()(y) y = Dense(units = n_classes, activation = None, use_bias = False, kernel_regularizer=tf.keras.regularizers.l2(1e-3))(y) y = Activation("softmax")(y) model_A = Model(inputs = x, outputs = y) model_A.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss = "sparse_categorical_crossentropy", metrics = ["accuracy"]) return model_A def remove_last_layer(model, loss = "mean_absolute_error"): """ Input: Keras model, a classification model whose last layer is a softmax activation Output: Keras model, the same model with the last softmax activation layer removed, while keeping the same parameters """ new_model = Model(inputs = model.inputs, outputs = model.layers[-2].output) new_model.set_weights(model.get_weights()) new_model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), loss = loss) return new_model def train_models(models, X_train, y_train, X_test, y_test, save_dir = "./", save_names = None, early_stopping = True, min_delta = 0.001, patience = 3, batch_size = 128, epochs = 20, is_shuffle=True, verbose = 1 ): ''' Train an array of models on the same dataset. We use early termination to speed up training. ''' resulting_val_acc = [] record_result = [] for n, model in enumerate(models): print("Training model ", n) if early_stopping: model.fit(X_train, y_train, validation_data = [X_test, y_test], callbacks=[EarlyStopping(monitor='val_accuracy', min_delta=min_delta, patience=patience)], batch_size = batch_size, epochs = epochs, shuffle=is_shuffle, verbose = verbose ) else: model.fit(X_train, y_train, validation_data = [X_test, y_test], batch_size = batch_size, epochs = epochs, shuffle=is_shuffle, verbose = verbose ) resulting_val_acc.append(model.history.history["val_accuracy"][-1]) record_result.append({"train_acc": model.history.history["accuracy"], "val_acc": model.history.history["val_accuracy"], "train_loss": model.history.history["loss"], "val_loss": model.history.history["val_loss"]}) if save_dir is not None: save_dir_path = os.path.abspath(save_dir) #make dir try: os.makedirs(save_dir_path) except OSError as e: if e.errno != errno.EEXIST: raise if save_names is None: file_name = save_dir + "model_{0}".format(n) + ".h5" else: file_name = save_dir + save_names[n] + ".h5" model.save(file_name) print("pre-train accuracy: ") print(resulting_val_acc) return record_result ================================================ FILE: README.md ================================================ # FedMD FedMD: Heterogenous Federated Learning via Model Distillation. Preprint on https://arxiv.org/abs/1910.03581. ## Run scripts on Google Colab 1. open a google Colab 2. Clone the project folder from Github ``` ! git clone github_link ``` 3. Then access the folder just created. ``` % cd project_folder/ ``` 4. Run the python script in Colab. For instance ``` ! python FEMNIST_Balanced.py -conf conf/EMNIST_balance_conf.json ``` ================================================ FILE: conf/CIFAR_balance_conf.json ================================================ { "models": [{"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 256, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 384, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, 'n2': 512, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 256, "dropout_rate": 0.3}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 512, "dropout_rate": 0.4}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 192, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 192, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 128, "dropout_rate": 0.3}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 192, "dropout_rate": 0.3}} ], "pre_train_params": {"min_delta": 0.005, "patience": 3, "batch_size": 128, "epochs": 20, "is_shuffle": True, "verbose": 1}, "model_saved_dir": None, "model_saved_names" : ["CNN_128_256", "CNN_128_384", "CNN_128_512", "CNN_256_256", "CNN_256_512", "CNN_64_128_256", "CNN_64_128_192", "CNN_128_192_256", "CNN_128_128_128", "CNN_128_128_192"], "early_stopping" : True, "N_parties": 10, "N_samples_per_class": 3, "N_alignment": 5000, "private_classes": [0,2,20,63,71,82], "public_classes": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "is_show": False, "N_rounds": 20, "N_logits_matching_round": 1, "N_private_training_round": 4, "private_training_batchsize" : 5, "logits_matching_batchsize": 256, "EMNIST_dir": None, "result_save_dir": "./result_CIFAR_balanced/" } ================================================ FILE: conf/CIFAR_imbalance_conf.json ================================================ { "models": [{"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 256, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 384, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, 'n2': 512, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 256, "dropout_rate": 0.3}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 512, "dropout_rate": 0.4}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 192, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 192, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 128, "dropout_rate": 0.3}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 192, "dropout_rate": 0.3}} ], "pre_train_params": {"min_delta": 0.005, "patience": 3, "batch_size": 128, "epochs": 20, "is_shuffle": True, "verbose": 1}, "model_saved_dir": None, "model_saved_names" : ["CNN_128_256", "CNN_128_384", "CNN_128_512", "CNN_256_256", "CNN_256_512", "CNN_64_128_256", "CNN_64_128_192", "CNN_128_192_256", "CNN_128_128_128", "CNN_128_128_192"], "early_stopping" : True, "N_parties": 10, "N_samples_per_class": 20, "N_alignment": 5000, "private_classes": [0,1,7,9,12,18], "public_classes": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "is_show": False, "N_rounds": 13, "N_logits_matching_round": 1, "N_private_training_round": 10, "private_training_batchsize" : 10, "logits_matching_batchsize": 128, "EMNIST_dir": None, "result_save_dir": "./result_CIFAR_imbalanced/" } ================================================ FILE: conf/EMNIST_balance_conf.json ================================================ { "models": [{"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 256, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 384, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, 'n2': 512, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 256, "dropout_rate": 0.3}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 512, "dropout_rate": 0.4}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 192, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 192, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 128, "dropout_rate": 0.3}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 192, "dropout_rate": 0.3}} ], "pre_train_params": {"min_delta": 0.001, "patience": 3, "batch_size": 128, "epochs": 20, "is_shuffle": True, "verbose": 1}, "model_saved_dir": None, "model_saved_names" : ["CNN_128_256", "CNN_128_384", "CNN_128_512", "CNN_256_256", "CNN_256_512", "CNN_64_128_256", "CNN_64_128_192", "CNN_128_192_256", "CNN_128_128_128", "CNN_128_128_192"], "early_stopping" : True, "N_parties": 10, "N_samples_per_class": 3, "N_alignment": 5000, "private_classes": [10, 11, 12, 13, 14, 15], "public_classes": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "is_show": False, "N_rounds": 20, "N_logits_matching_round": 1, "N_private_training_round": 2, "private_training_batchsize" : 5, "logits_matching_batchsize": 256, "EMNIST_dir": "./dataset/emnist-letters.mat", "result_save_dir": "./result_FEMNIST_balanced/" } ================================================ FILE: conf/EMNIST_imbalance_conf.json ================================================ { "models": [{"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 256, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, "n2": 384, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 128, 'n2': 512, "dropout_rate": 0.2}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 256, "dropout_rate": 0.3}}, {"model_type": "2_layer_CNN", "params": {"n1": 256, "n2": 512, "dropout_rate": 0.4}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 64, "n2": 128, "n3": 192, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 192, "n3": 256, "dropout_rate": 0.2}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 128, "dropout_rate": 0.3}}, {"model_type": "3_layer_CNN", "params": {"n1": 128, "n2": 128, "n3": 192, "dropout_rate": 0.3}} ], "pre_train_params": {"min_delta": 0.001, "patience": 3, "batch_size": 128, "epochs": 20, "is_shuffle": True, "verbose": 1}, "model_saved_dir": None, "model_saved_names" : ["CNN_128_256", "CNN_128_384", "CNN_128_512", "CNN_256_256", "CNN_256_512", "CNN_64_128_256", "CNN_64_128_192", "CNN_128_192_256", "CNN_128_128_128", "CNN_128_128_192"], "early_stopping" : True, "N_parties": 10, "N_samples_per_class": 3, "N_alignment": 5000, "private_classes": [10, 11, 12, 13, 14, 15], "public_classes": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "is_show": False, "N_rounds": 20, "N_logits_matching_round": 1, "N_private_training_round": 4, "private_training_batchsize" : 5, "logits_matching_batchsize": 256, "EMNIST_dir": "./dataset/emnist-letters.mat", "result_save_dir": "./result_FEMNIST_imbalanced/" } ================================================ FILE: data_utils.py ================================================ import pickle import os import numpy as np import pandas as pd from sklearn.model_selection import StratifiedShuffleSplit from tensorflow.keras.datasets import cifar10, cifar100, mnist import scipy.io as sio def load_MNIST_data(standarized = False, verbose = False): (X_train, y_train), (X_test, y_test) = mnist.load_data() if standarized: X_train = X_train/255 X_test = X_test/255 mean_image = np.mean(X_train, axis=0) X_train -= mean_image X_test -= mean_image if verbose == True: print("MNIST dataset ... ") print("X_train shape :", X_train.shape) print("X_test shape :", X_test.shape) print("y_train shape :", y_train.shape) print("y_test shape :", y_test.shape) return X_train, y_train, X_test, y_test def load_EMNIST_data(file, verbose = False, standarized = False): """ file should be the downloaded EMNIST file in .mat format. """ mat = sio.loadmat(file) data = mat["dataset"] writer_ids_train = data['train'][0,0]['writers'][0,0] writer_ids_train = np.squeeze(writer_ids_train) X_train = data['train'][0,0]['images'][0,0] X_train = X_train.reshape((X_train.shape[0], 28, 28), order = "F") y_train = data['train'][0,0]['labels'][0,0] y_train = np.squeeze(y_train) y_train -= 1 #y_train is zero-based writer_ids_test = data['test'][0,0]['writers'][0,0] writer_ids_test = np.squeeze(writer_ids_test) X_test = data['test'][0,0]['images'][0,0] X_test= X_test.reshape((X_test.shape[0], 28, 28), order = "F") y_test = data['test'][0,0]['labels'][0,0] y_test = np.squeeze(y_test) y_test -= 1 #y_test is zero-based if standarized: X_train = X_train/255 X_test = X_test/255 mean_image = np.mean(X_train, axis=0) X_train -= mean_image X_test -= mean_image if verbose == True: print("EMNIST-letter dataset ... ") print("X_train shape :", X_train.shape) print("X_test shape :", X_test.shape) print("y_train shape :", y_train.shape) print("y_test shape :", y_test.shape) return X_train, y_train, X_test, y_test, writer_ids_train, writer_ids_test def load_CIFAR_data(data_type="CIFAR10", label_mode="fine", standarized = False, verbose = False): if data_type == "CIFAR10": (X_train, y_train), (X_test, y_test) = cifar10.load_data() elif data_type == "CIFAR100": (X_train, y_train), (X_test, y_test) = cifar100.load_data(label_mode = label_mode) else: print("Unknown Data type. Stopped!") return None y_train = np.squeeze(y_train) y_test = np.squeeze(y_test) # substract mean and normalized to [-1/2,1/2] if standarized: X_train = X_train/255 X_test = X_test/255 mean_image = np.mean(X_train, axis=0) X_train -= mean_image X_test -= mean_image if verbose == True: print("X_train shape :", X_train.shape) print("X_test shape :", X_test.shape) print("y_train shape :", y_train.shape) print("y_test shape :", y_test.shape) return X_train, y_train, X_test, y_test def load_CIFAR_from_local(local_dir, data_type="CIFAR10", with_coarse_label = False, standarized = False, verbose = False): #dir_name = os.path.abspath(local_dir) if data_type == "CIFAR10": X_train, y_train = [], [] for i in range(1, 6, 1): file_name = None file_name = os.path.join(local_dir + "data_batch_{0}".format(i)) X_tmp, y_tmp = None, None with open(file_name, 'rb') as fo: datadict = pickle.load(fo, encoding='bytes') X_tmp = datadict[b'data'] y_tmp = datadict[b'labels'] X_tmp = X_tmp.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float") y_tmp = np.array(y_tmp) X_train.append(X_tmp) y_train.append(y_tmp) del X_tmp, y_tmp X_train = np.vstack(X_train) y_train = np.hstack(y_train) file_name = None file_name = os.path.join(local_dir + "test_batch") with open(file_name, 'rb') as fo: datadict = pickle.load(fo, encoding='bytes') X_test = datadict[b'data'] y_test = datadict[b'labels'] X_test = X_test.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float") y_test = np.array(y_test) elif data_type == "CIFAR100": file_name = None file_name = os.path.abspath(local_dir + "train") with open(file_name, 'rb') as fo: datadict = pickle.load(fo, encoding='bytes') X_train = datadict[b'data'] if with_coarse_label: y_train = datadict[b'coarse_labels'] else: y_train = datadict[b'fine_labels'] X_train = X_train.reshape(50000, 3, 32, 32).transpose(0,2,3,1).astype("float") y_train = np.array(y_train) file_name = None file_name = os.path.join(local_dir + "test") with open(file_name, 'rb') as fo: datadict = pickle.load(fo, encoding='bytes') X_test = datadict[b'data'] if with_coarse_label: y_test = datadict[b'coarse_labels'] else: y_test = datadict[b'fine_labels'] X_test = X_test.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float") y_test = np.array(y_test) else: print("Unknown Data type. Stopped!") return None if standarized: X_train = X_train/255 X_test = X_test/255 mean_image = np.mean(X_train, axis=0) X_train -= mean_image X_test -= mean_image if verbose == True: print("X_train shape :", X_train.shape) print("X_test shape :", X_test.shape) print("y_train shape :", y_train.shape) print("y_test shape :", y_test.shape) return X_train, y_train, X_test, y_test def generate_partial_data(X, y, class_in_use = None, verbose = False): if class_in_use is None: idx = np.ones_like(y, dtype = bool) else: idx = [y == i for i in class_in_use] idx = np.any(idx, axis = 0) X_incomplete, y_incomplete = X[idx], y[idx] if verbose == True: print("X shape :", X_incomplete.shape) print("y shape :", y_incomplete.shape) return X_incomplete, y_incomplete def generate_bal_private_data(X, y, N_parties = 10, classes_in_use = range(11), N_samples_per_class = 20, data_overlap = False): """ Input: -- N_parties : int, number of collaboraters in this activity; -- classes_in_use: array or generator, the classes of EMNIST-letters dataset (0 <= y <= 25) to be used as private data; -- N_sample_per_class: int, the number of private data points of each class for each party return: """ priv_data = [None] * N_parties combined_idx = np.array([], dtype = np.int16) for cls in classes_in_use: idx = np.where(y == cls)[0] idx = np.random.choice(idx, N_samples_per_class * N_parties, replace = data_overlap) combined_idx = np.r_[combined_idx, idx] for i in range(N_parties): idx_tmp = idx[i * N_samples_per_class : (i + 1)*N_samples_per_class] if priv_data[i] is None: tmp = {} tmp["X"] = X[idx_tmp] tmp["y"] = y[idx_tmp] tmp["idx"] = idx_tmp priv_data[i] = tmp else: priv_data[i]['idx'] = np.r_[priv_data[i]["idx"], idx_tmp] priv_data[i]["X"] = np.vstack([priv_data[i]["X"], X[idx_tmp]]) priv_data[i]["y"] = np.r_[priv_data[i]["y"], y[idx_tmp]] total_priv_data = {} total_priv_data["idx"] = combined_idx total_priv_data["X"] = X[combined_idx] total_priv_data["y"] = y[combined_idx] return priv_data, total_priv_data def generate_alignment_data(X, y, N_alignment = 3000): split = StratifiedShuffleSplit(n_splits=1, train_size= N_alignment) if N_alignment == "all": alignment_data = {} alignment_data["idx"] = np.arange(y.shape[0]) alignment_data["X"] = X alignment_data["y"] = y return alignment_data for train_index, _ in split.split(X, y): X_alignment = X[train_index] y_alignment = y[train_index] alignment_data = {} alignment_data["idx"] = train_index alignment_data["X"] = X_alignment alignment_data["y"] = y_alignment return alignment_data def generate_EMNIST_writer_based_data(X, y, writer_info, N_priv_data_min = 30, N_parties = 5, classes_in_use = range(6)): # mask is a boolean array of the same shape as y # mask[i] = True if y[i] in classes_in_use mask = None mask = [y == i for i in classes_in_use] mask = np.any(mask, axis = 0) df_tmp = None df_tmp = pd.DataFrame({"writer_ids": writer_info, "is_in_use": mask}) #print(df_tmp.head()) groupped = df_tmp[df_tmp["is_in_use"]].groupby("writer_ids") # organize the input the data (X,y) by writer_ids. # That is, # data_by_writer is a dictionary where the keys are writer_ids, # and the contents are the correcponding data. # Notice that only data with labels in class_in_use are included. data_by_writer = {} writer_ids = [] for wt_id, idx in groupped.groups.items(): if len(idx) >= N_priv_data_min: writer_ids.append(wt_id) data_by_writer[wt_id] = {"X": X[idx], "y": y[idx], "idx": idx, "writer_id": wt_id} # each participant in the collaborative group is assigned data # from a single writer. ids_to_use = np.random.choice(writer_ids, size = N_parties, replace = False) combined_idx = np.array([], dtype = np.int64) private_data = [] for i in range(N_parties): id_tmp = ids_to_use[i] private_data.append(data_by_writer[id_tmp]) combined_idx = np.r_[combined_idx, data_by_writer[id_tmp]["idx"]] del id_tmp total_priv_data = {} total_priv_data["idx"] = combined_idx total_priv_data["X"] = X[combined_idx] total_priv_data["y"] = y[combined_idx] return private_data, total_priv_data def generate_imbal_CIFAR_private_data(X, y, y_super, classes_per_party, N_parties, samples_per_class=7): priv_data = [None] * N_parties combined_idxs = [] count = 0 for subcls_list in classes_per_party: idxs_per_party = [] for c in subcls_list: idxs = np.flatnonzero(y == c) idxs = np.random.choice(idxs, samples_per_class, replace=False) idxs_per_party.append(idxs) idxs_per_party = np.hstack(idxs_per_party) combined_idxs.append(idxs_per_party) dict_to_add = {} dict_to_add["idx"] = idxs_per_party dict_to_add["X"] = X[idxs_per_party] #dict_to_add["y"] = y[idxs_per_party] #dict_to_add["y_super"] = y_super[idxs_per_party] dict_to_add["y"] = y_super[idxs_per_party] priv_data[count] = dict_to_add count += 1 combined_idxs = np.hstack(combined_idxs) total_priv_data = {} total_priv_data["idx"] = combined_idxs total_priv_data["X"] = X[combined_idxs] #total_priv_data["y"] = y[combined_idxs] #total_priv_data["y_super"] = y_super[combined_idxs] total_priv_data["y"] = y_super[combined_idxs] return priv_data, total_priv_data ================================================ FILE: dataset/emnist-letters.mat ================================================ [File too large to display: 46.0 MB] ================================================ FILE: utility.py ================================================ import numpy as np import pandas as pd import matplotlib.pyplot as plt def show_dataset_samples(classes, samples_per_class, images, labels, data_type="MNIST"): num_classes = len(classes) fig, axes = plt.subplots(samples_per_class, num_classes, figsize=(num_classes, samples_per_class) ) for col_index, cls in enumerate(classes): idxs = np.flatnonzero(labels == cls) idxs = np.random.choice(idxs, samples_per_class, replace=False) for row_index, idx in enumerate(idxs): if data_type == "MNIST": axes[row_index][col_index].imshow(images[idx], cmap = 'binary', interpolation="nearest") axes[row_index][col_index].axis("off") elif data_type == "CIFAR": axes[row_index][col_index].imshow(images[idx].astype('uint8')) axes[row_index][col_index].axis("off") else: print("Unknown Data type. Unable to plot.") return None if row_index==0: axes[row_index][col_index].set_title("Class {0}".format(cls)) plt.show() return None # def plot_history(model): # """ # input : model is trained keras model. # """ # fig, axes = plt.subplots(2,1, figsize = (12, 6), sharex = True) # axes[0].plot(model.history.history["loss"], "b.-", label = "Training Loss") # axes[0].plot(model.history.history["val_loss"], "k^-", label = "Val Loss") # axes[0].set_xlabel("Epoch") # axes[0].set_ylabel("Loss") # axes[0].legend() # axes[1].plot(model.history.history["acc"], "b.-", label = "Training Acc") # axes[1].plot(model.history.history["val_acc"], "k^-", label = "Val Acc") # axes[1].set_xlabel("Epoch") # axes[1].set_ylabel("Accuracy") # axes[1].legend() # plt.subplots_adjust(hspace=0) # plt.show() # def show_performance(model, Xtrain, ytrain, Xtest, ytest): # y_pred = None # print("CNN+fC Training Accuracy :") # y_pred = model.predict(Xtrain, verbose = 0).argmax(axis = 1) # print((y_pred == ytrain).mean()) # print("CNN+fc Test Accuracy :") # y_pred = model.predict(Xtest, verbose = 0).argmax(axis = 1) # print((y_pred == ytest).mean()) # print("Confusion_matrix : ") # print(confusion_matrix(y_true = ytest, y_pred = y_pred)) # del y_pred