[
  {
    "path": "CIFAR_Balanced.py",
    "content": "# FedMD experiment script: CIFAR10 serves as the public dataset and the CIFAR100\n# classes listed under private_classes in the config serve as the private data.\nimport os\nimport errno\nimport argparse\nimport sys\nimport pickle\n\nimport numpy as np\nimport pandas as pd\nfrom tensorflow.keras.models import load_model\n\nfrom data_utils import load_CIFAR_data, generate_partial_data, generate_bal_private_data\nfrom FedMD import FedMD\nfrom Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model\n\n\ndef parseArg():\n    \"\"\"Return the config file path: the -conf argument when given, otherwise the default.\"\"\"\n    parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \\\n    Participants are training collaboratively. ')\n    parser.add_argument('-conf', metavar='conf_file', nargs=1,\n                        help='the config file for FedMD.'\n                       )\n\n    conf_file = os.path.abspath(\"conf/CIFAR_balance_conf.json\")\n\n    if len(sys.argv) > 1:\n        args = parser.parse_args(sys.argv[1:])\n        if args.conf:\n            conf_file = args.conf[0]\n    return conf_file\n\n\nCANDIDATE_MODELS = {\"2_layer_CNN\": cnn_2layer_fc_model,\n                    \"3_layer_CNN\": cnn_3layer_fc_model}\n\n\nif __name__ == \"__main__\":\n    conf_file = parseArg()\n    with open(conf_file, \"r\") as f:\n        # NOTE(review): eval() executes any Python expression found in the config file,\n        # so only trusted config files may be passed via -conf.\n        conf_dict = eval(f.read())\n\n    model_config = conf_dict[\"models\"]\n    pre_train_params = conf_dict[\"pre_train_params\"]\n    model_saved_dir = conf_dict[\"model_saved_dir\"]\n    model_saved_names = conf_dict[\"model_saved_names\"]\n    is_early_stopping = conf_dict[\"early_stopping\"]\n    public_classes = conf_dict[\"public_classes\"]\n    private_classes = conf_dict[\"private_classes\"]\n    n_classes = len(public_classes) + len(private_classes)\n\n    N_parties = conf_dict[\"N_parties\"]\n    N_samples_per_class = conf_dict[\"N_samples_per_class\"]\n\n    N_rounds = conf_dict[\"N_rounds\"]\n    N_alignment = conf_dict[\"N_alignment\"]\n    N_private_training_round = conf_dict[\"N_private_training_round\"]\n    private_training_batchsize = conf_dict[\"private_training_batchsize\"]\n    N_logits_matching_round = conf_dict[\"N_logits_matching_round\"]\n    logits_matching_batchsize = conf_dict[\"logits_matching_batchsize\"]\n\n    result_save_dir = conf_dict[\"result_save_dir\"]\n\n    del conf_dict, conf_file\n\n    X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10 = load_CIFAR_data(\n        data_type=\"CIFAR10\", standarized=True, verbose=True)\n\n    public_dataset = {\"X\": X_train_CIFAR10, \"y\": y_train_CIFAR10}\n\n    X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100 = load_CIFAR_data(\n        data_type=\"CIFAR100\", standarized=True, verbose=True)\n\n    # only use those CIFAR100 data whose y_labels belong to private_classes\n    X_train_CIFAR100, y_train_CIFAR100 = generate_partial_data(\n        X=X_train_CIFAR100, y=y_train_CIFAR100,\n        class_in_use=private_classes, verbose=True)\n\n    X_test_CIFAR100, y_test_CIFAR100 = generate_partial_data(\n        X=X_test_CIFAR100, y=y_test_CIFAR100,\n        class_in_use=private_classes, verbose=True)\n\n    # Relabel the selected CIFAR100 classes to len(public_classes), len(public_classes)+1, ...\n    # The masks are taken from snapshots of the original labels so that a freshly written\n    # label that happens to equal a not-yet-processed private class id is not relabelled twice.\n    y_train_orig = y_train_CIFAR100.copy()\n    y_test_orig = y_test_CIFAR100.copy()\n    for index, cls_ in enumerate(private_classes):\n        y_train_CIFAR100[y_train_orig == cls_] = index + len(public_classes)\n        y_test_CIFAR100[y_test_orig == cls_] = index + len(public_classes)\n    del index, cls_, y_train_orig, y_test_orig\n\n    print(pd.Series(y_train_CIFAR100).value_counts())\n    mod_private_classes = np.arange(len(private_classes)) + len(public_classes)\n\n    print(\"=\"*60)\n    #generate private data\n    private_data, total_private_data = generate_bal_private_data(\n        X_train_CIFAR100, y_train_CIFAR100,\n        N_parties=N_parties,\n        classes_in_use=mod_private_classes,\n        N_samples_per_class=N_samples_per_class,\n        data_overlap=False)\n    print(\"=\"*60)\n    X_tmp, y_tmp = generate_partial_data(X=X_test_CIFAR100, y=y_test_CIFAR100,\n                                         class_in_use=mod_private_classes,\n                                         verbose=True)\n    private_test_data = {\"X\": X_tmp, \"y\": y_tmp}\n    del X_tmp, y_tmp\n\n    parties = []\n    if model_saved_dir is None:\n        # No saved models configured: build every model from the config and pre-train it on CIFAR10.\n        for i, item in enumerate(model_config):\n            model_name = item[\"model_type\"]\n            model_params = item[\"params\"]\n            tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes,\n                                               input_shape=(32,32,3),\n                                               **model_params)\n            print(\"model {0} : {1}\".format(i, model_saved_names[i]))\n            # summary() prints the table itself and returns None, so it is not wrapped in print()\n            tmp.summary()\n            parties.append(tmp)\n\n            del model_name, model_params, tmp\n        #END FOR LOOP\n        pre_train_result = train_models(parties,\n                                        X_train_CIFAR10, y_train_CIFAR10,\n                                        X_test_CIFAR10, y_test_CIFAR10,\n                                        save_dir = model_saved_dir, save_names = model_saved_names,\n                                        early_stopping = is_early_stopping,\n                                        **pre_train_params\n                                       )\n    else:\n        dpath = os.path.abspath(model_saved_dir)\n        # os.listdir() returns names in arbitrary order; sorting keeps the assignment of\n        # saved models to party indices reproducible between runs.\n        for name in sorted(os.listdir(dpath)):\n            parties.append(load_model(os.path.join(dpath, name)))\n\n    del X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10\n    del X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100\n\n    fedmd = FedMD(parties,\n                  public_dataset = public_dataset,\n                  private_data = private_data,\n                  total_private_data = total_private_data,\n                  private_test_data = private_test_data,\n                  N_rounds = N_rounds,\n                  N_alignment = N_alignment,\n                  N_logits_matching_round = N_logits_matching_round,\n                  logits_matching_batchsize = logits_matching_batchsize,\n                  N_private_training_round = N_private_training_round,\n                  private_training_batchsize = private_training_batchsize)\n\n    initialization_result = fedmd.init_result\n    pooled_train_result = fedmd.pooled_train_result\n\n    collaboration_performance = fedmd.collaborative_training()\n\n    # Results are written only when an output directory is configured; save_dir_path\n    # does not exist otherwise.\n    if result_save_dir is not None:\n        save_dir_path = os.path.abspath(result_save_dir)\n        os.makedirs(save_dir_path, exist_ok=True)\n\n        results_to_save = {'init_result.pkl': initialization_result,\n                           'pooled_train_result.pkl': pooled_train_result,\n                           'col_performance.pkl': collaboration_performance}\n        # pre_train_result only exists when the models were pre-trained in this run\n        # (model_saved_dir is None); nothing is written for it otherwise.\n        if model_saved_dir is None:\n            results_to_save['pre_train_result.pkl'] = pre_train_result\n\n        for file_name, result in results_to_save.items():\n            with open(os.path.join(save_dir_path, file_name), 'wb') as f:\n                pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)\n"
  },
  {
    "path": "CIFAR_Imbalanced.py",
    "content": "# FedMD experiment script: CIFAR10 serves as the public dataset; the private data are\n# CIFAR100 samples labelled by superclass, where each party is given one fine class\n# per private superclass.\nimport os\nimport errno\nimport argparse\nimport sys\nimport pickle\n\nimport numpy as np\nimport pandas as pd\nfrom tensorflow.keras.models import load_model\n\nfrom data_utils import load_CIFAR_data, load_CIFAR_from_local, generate_partial_data, generate_imbal_CIFAR_private_data\nfrom FedMD import FedMD\nfrom Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model\n\n\ndef parseArg():\n    \"\"\"Return the config file path: the -conf argument when given, otherwise the default.\"\"\"\n    parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \\\n    Participants are training collaboratively. ')\n    parser.add_argument('-conf', metavar='conf_file', nargs=1,\n                        help='the config file for FedMD.'\n                       )\n\n    conf_file = os.path.abspath(\"conf/CIFAR_imbalance_conf.json\")\n\n    if len(sys.argv) > 1:\n        args = parser.parse_args(sys.argv[1:])\n        if args.conf:\n            conf_file = args.conf[0]\n    return conf_file\n\n\nCANDIDATE_MODELS = {\"2_layer_CNN\": cnn_2layer_fc_model,\n                    \"3_layer_CNN\": cnn_3layer_fc_model}\n\n\nif __name__ == \"__main__\":\n    conf_file = parseArg()\n    with open(conf_file, \"r\") as f:\n        # NOTE(review): eval() executes any Python expression found in the config file,\n        # so only trusted config files may be passed via -conf.\n        conf_dict = eval(f.read())\n\n    model_config = conf_dict[\"models\"]\n    pre_train_params = conf_dict[\"pre_train_params\"]\n    model_saved_dir = conf_dict[\"model_saved_dir\"]\n    model_saved_names = conf_dict[\"model_saved_names\"]\n    is_early_stopping = conf_dict[\"early_stopping\"]\n    public_classes = conf_dict[\"public_classes\"]\n    public_classes.sort()\n    private_classes = conf_dict[\"private_classes\"]\n    private_classes.sort()\n    n_classes = len(public_classes) + len(private_classes)\n\n    N_parties = conf_dict[\"N_parties\"]\n    N_samples_per_class = conf_dict[\"N_samples_per_class\"]\n\n    N_rounds = conf_dict[\"N_rounds\"]\n    N_alignment = conf_dict[\"N_alignment\"]\n    N_private_training_round = conf_dict[\"N_private_training_round\"]\n    private_training_batchsize = conf_dict[\"private_training_batchsize\"]\n    N_logits_matching_round = conf_dict[\"N_logits_matching_round\"]\n    logits_matching_batchsize = conf_dict[\"logits_matching_batchsize\"]\n\n    result_save_dir = conf_dict[\"result_save_dir\"]\n\n    del conf_dict, conf_file\n\n    X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10 = load_CIFAR_data(\n        data_type=\"CIFAR10\", standarized=True, verbose=True)\n\n    public_dataset = {\"X\": X_train_CIFAR10, \"y\": y_train_CIFAR10}\n\n    X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100 = load_CIFAR_data(\n        data_type=\"CIFAR100\", standarized=True, verbose=True)\n\n    # Load CIFAR100 a second time with coarse labels; only the superclass labels are kept.\n    a_, y_train_super, b_, y_test_super = load_CIFAR_data(\n        data_type=\"CIFAR100\", label_mode=\"coarse\", standarized=True, verbose=True)\n    del a_, b_\n\n    # Alternative: load the datasets from local files instead.\n    # X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10 = load_CIFAR_from_local(\n    #     local_dir=\"./dataset/CIFAR10/\", data_type=\"CIFAR10\",\n    #     standarized=True, verbose=True)\n    # public_dataset = {\"X\": X_train_CIFAR10, \"y\": y_train_CIFAR10}\n    # X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100 = load_CIFAR_from_local(\n    #     local_dir=\"./dataset/CIFAR100/\", data_type=\"CIFAR100\", with_coarse_label=False,\n    #     standarized=True, verbose=True)\n    # a_, y_train_super, b_, y_test_super = load_CIFAR_from_local(\n    #     local_dir=\"./dataset/CIFAR100/\", data_type=\"CIFAR100\", with_coarse_label=True,\n    #     standarized=True, verbose=True)\n    # del a_, b_\n\n    #Find the relations between superclasses and subclasses\n    relations = [set() for i in range(np.max(y_train_super)+1)]\n    for i, y_fine in enumerate(y_train_CIFAR100):\n        relations[y_train_super[i]].add(y_fine)\n    for i in range(len(relations)):\n        relations[i]=list(relations[i])\n\n    del i, y_fine\n    #print(relations)\n    #print(np.array(relations).shape)\n\n    # Party i gets fine class number (i % 5) of every private superclass.\n    # NOTE(review): assumes every private superclass has at least 5 fine classes - confirm.\n    fine_classes_in_use = [[relations[j][i%5] for j in private_classes]\n                           for i in range(N_parties)]\n    print(fine_classes_in_use)\n\n    #Generate test set\n    X_tmp, y_tmp = generate_partial_data(X_test_CIFAR100, y_test_super,\n                                         class_in_use = private_classes,\n                                         verbose = True)\n    #print(pd.Series(y_tmp).value_counts())\n\n    # Relabel the private superclasses to len(public_classes), len(public_classes)+1, ...\n    # The mask is taken from a snapshot of the original labels so that a freshly written\n    # label that happens to equal another private class id is not relabelled twice.\n    y_tmp_orig = y_tmp.copy()\n    for index, cls_ in enumerate(private_classes):\n        y_tmp[y_tmp_orig == cls_] = index + len(public_classes)\n    #print(pd.Series(y_tmp).value_counts())\n    private_test_data = {\"X\": X_tmp, \"y\": y_tmp}\n    del index, cls_, X_tmp, y_tmp, y_tmp_orig\n    print(\"=\"*60)\n\n    #generate private data\n    private_data, total_private_data = generate_imbal_CIFAR_private_data(\n        X_train_CIFAR100, y_train_CIFAR100, y_train_super,\n        N_parties=N_parties,\n        classes_per_party=fine_classes_in_use,\n        samples_per_class=N_samples_per_class)\n\n    # Same relabelling for the private training labels, again masking on snapshots of the\n    # original labels so the result does not depend on the processing order.\n    total_y_orig = total_private_data[\"y\"].copy()\n    party_y_orig = [private_data[i][\"y\"].copy() for i in range(N_parties)]\n    for index, cls_ in enumerate(private_classes):\n        new_label = index + len(public_classes)\n        total_private_data[\"y\"][total_y_orig == cls_] = new_label\n        for i in range(N_parties):\n            private_data[i][\"y\"][party_y_orig[i] == cls_] = new_label\n\n    del index, cls_, new_label, total_y_orig, party_y_orig\n\n    # for i in range(N_parties):\n    #     print(\"iter:\", i)\n    #     print(pd.Series(private_data[i][\"y\"]).value_counts())\n    # print(pd.Series(total_private_data[\"y\"]).value_counts())\n\n    mod_private_classes = np.arange(len(private_classes)) + len(public_classes)\n\n    print(\"=\" * 60)\n\n    parties = []\n    if model_saved_dir is None:\n        # No saved models configured: build every model from the config and pre-train it on CIFAR10.\n        for i, item in enumerate(model_config):\n            model_name = item[\"model_type\"]\n            model_params = item[\"params\"]\n            tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes,\n                                               input_shape=(32,32,3),\n                                               **model_params)\n            print(\"model {0} : {1}\".format(i, model_saved_names[i]))\n            # summary() prints the table itself and returns None, so it is not wrapped in print()\n            tmp.summary()\n            parties.append(tmp)\n\n            del model_name, model_params, tmp\n        #END FOR LOOP\n        pre_train_result = train_models(parties,\n                                        X_train_CIFAR10, y_train_CIFAR10,\n                                        X_test_CIFAR10, y_test_CIFAR10,\n                                        save_dir = model_saved_dir, save_names = model_saved_names,\n                                        early_stopping = is_early_stopping,\n                                        **pre_train_params\n                                       )\n    else:\n        dpath = os.path.abspath(model_saved_dir)\n        # os.listdir() returns names in arbitrary order; sorting keeps the assignment of\n        # saved models to party indices reproducible between runs.\n        for name in sorted(os.listdir(dpath)):\n            parties.append(load_model(os.path.join(dpath, name)))\n\n    del X_train_CIFAR10, y_train_CIFAR10, X_test_CIFAR10, y_test_CIFAR10\n    del X_train_CIFAR100, y_train_CIFAR100, X_test_CIFAR100, y_test_CIFAR100\n    del y_train_super, y_test_super\n\n    fedmd = FedMD(parties,\n                  public_dataset = public_dataset,\n                  private_data = private_data,\n                  total_private_data = total_private_data,\n                  private_test_data = private_test_data,\n                  N_rounds = N_rounds,\n                  N_alignment = N_alignment,\n                  N_logits_matching_round = N_logits_matching_round,\n                  logits_matching_batchsize = logits_matching_batchsize,\n                  N_private_training_round = N_private_training_round,\n                  private_training_batchsize = private_training_batchsize)\n\n    initialization_result = fedmd.init_result\n    pooled_train_result = fedmd.pooled_train_result\n\n    collaboration_performance = fedmd.collaborative_training()\n\n    # Results are written only when an output directory is configured; save_dir_path\n    # does not exist otherwise.\n    if result_save_dir is not None:\n        save_dir_path = os.path.abspath(result_save_dir)\n        os.makedirs(save_dir_path, exist_ok=True)\n\n        results_to_save = {'init_result.pkl': initialization_result,\n                           'pooled_train_result.pkl': pooled_train_result,\n                           'col_performance.pkl': collaboration_performance}\n        # pre_train_result only exists when the models were pre-trained in this run\n        # (model_saved_dir is None); nothing is written for it otherwise.\n        if model_saved_dir is None:\n            results_to_save['pre_train_result.pkl'] = pre_train_result\n\n        for file_name, result in results_to_save.items():\n            with open(os.path.join(save_dir_path, file_name), 'wb') as f:\n                pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)\n"
  },
  {
    "path": "FEMNIST_Balanced.py",
    "content": "# FedMD experiment script: MNIST serves as the public dataset and the EMNIST classes\n# listed under private_classes in the config serve as the private data.\nimport os\nimport errno\nimport argparse\nimport sys\nimport pickle\n\nimport numpy as np\nfrom tensorflow.keras.models import load_model\n\nfrom data_utils import load_MNIST_data, load_EMNIST_data, generate_bal_private_data, generate_partial_data\nfrom FedMD import FedMD\nfrom Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model\n\n\ndef parseArg():\n    \"\"\"Return the config file path: the -conf argument when given, otherwise the default.\"\"\"\n    parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \\\n    Participants are training collaboratively. ')\n    parser.add_argument('-conf', metavar='conf_file', nargs=1,\n                        help='the config file for FedMD.'\n                       )\n\n    conf_file = os.path.abspath(\"conf/EMNIST_balance_conf.json\")\n\n    if len(sys.argv) > 1:\n        args = parser.parse_args(sys.argv[1:])\n        if args.conf:\n            conf_file = args.conf[0]\n    return conf_file\n\n\nCANDIDATE_MODELS = {\"2_layer_CNN\": cnn_2layer_fc_model,\n                    \"3_layer_CNN\": cnn_3layer_fc_model}\n\n\nif __name__ == \"__main__\":\n    conf_file = parseArg()\n    with open(conf_file, \"r\") as f:\n        # NOTE(review): eval() executes any Python expression found in the config file,\n        # so only trusted config files may be passed via -conf.\n        conf_dict = eval(f.read())\n\n    model_config = conf_dict[\"models\"]\n    pre_train_params = conf_dict[\"pre_train_params\"]\n    model_saved_dir = conf_dict[\"model_saved_dir\"]\n    model_saved_names = conf_dict[\"model_saved_names\"]\n    is_early_stopping = conf_dict[\"early_stopping\"]\n    public_classes = conf_dict[\"public_classes\"]\n    private_classes = conf_dict[\"private_classes\"]\n    n_classes = len(public_classes) + len(private_classes)\n\n    emnist_data_dir = conf_dict[\"EMNIST_dir\"]\n    N_parties = conf_dict[\"N_parties\"]\n    N_samples_per_class = conf_dict[\"N_samples_per_class\"]\n\n    N_rounds = conf_dict[\"N_rounds\"]\n    N_alignment = conf_dict[\"N_alignment\"]\n    N_private_training_round = conf_dict[\"N_private_training_round\"]\n    private_training_batchsize = conf_dict[\"private_training_batchsize\"]\n    N_logits_matching_round = conf_dict[\"N_logits_matching_round\"]\n    logits_matching_batchsize = conf_dict[\"logits_matching_batchsize\"]\n\n    result_save_dir = conf_dict[\"result_save_dir\"]\n\n    del conf_dict, conf_file\n\n    X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST = load_MNIST_data(\n        standarized=True, verbose=True)\n\n    public_dataset = {\"X\": X_train_MNIST, \"y\": y_train_MNIST}\n\n    (X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST,\n     writer_ids_train, writer_ids_test) = load_EMNIST_data(\n        emnist_data_dir, standarized=True, verbose=True)\n\n    # Shift the EMNIST labels by the number of public classes.\n    y_train_EMNIST += len(public_classes)\n    y_test_EMNIST += len(public_classes)\n\n    #generate private data\n    private_data, total_private_data = generate_bal_private_data(\n        X_train_EMNIST, y_train_EMNIST,\n        N_parties=N_parties,\n        classes_in_use=private_classes,\n        N_samples_per_class=N_samples_per_class,\n        data_overlap=False)\n\n    X_tmp, y_tmp = generate_partial_data(X=X_test_EMNIST, y=y_test_EMNIST,\n                                         class_in_use=private_classes, verbose=True)\n    private_test_data = {\"X\": X_tmp, \"y\": y_tmp}\n    del X_tmp, y_tmp\n\n    parties = []\n    if model_saved_dir is None:\n        # No saved models configured: build every model from the config and pre-train it on MNIST.\n        for i, item in enumerate(model_config):\n            model_name = item[\"model_type\"]\n            model_params = item[\"params\"]\n            tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes,\n                                               input_shape=(28,28),\n                                               **model_params)\n            print(\"model {0} : {1}\".format(i, model_saved_names[i]))\n            # summary() prints the table itself and returns None, so it is not wrapped in print()\n            tmp.summary()\n            parties.append(tmp)\n\n            del model_name, model_params, tmp\n        #END FOR LOOP\n        pre_train_result = train_models(parties,\n                                        X_train_MNIST, y_train_MNIST,\n                                        X_test_MNIST, y_test_MNIST,\n                                        save_dir = model_saved_dir, save_names = model_saved_names,\n                                        early_stopping = is_early_stopping,\n                                        **pre_train_params\n                                       )\n    else:\n        dpath = os.path.abspath(model_saved_dir)\n        # os.listdir() returns names in arbitrary order; sorting keeps the assignment of\n        # saved models to party indices reproducible between runs.\n        for name in sorted(os.listdir(dpath)):\n            parties.append(load_model(os.path.join(dpath, name)))\n\n    del X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST\n    del X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST\n    del writer_ids_train, writer_ids_test\n\n    fedmd = FedMD(parties,\n                  public_dataset = public_dataset,\n                  private_data = private_data,\n                  total_private_data = total_private_data,\n                  private_test_data = private_test_data,\n                  N_rounds = N_rounds,\n                  N_alignment = N_alignment,\n                  N_logits_matching_round = N_logits_matching_round,\n                  logits_matching_batchsize = logits_matching_batchsize,\n                  N_private_training_round = N_private_training_round,\n                  private_training_batchsize = private_training_batchsize)\n\n    initialization_result = fedmd.init_result\n    pooled_train_result = fedmd.pooled_train_result\n\n    collaboration_performance = fedmd.collaborative_training()\n\n    # Results are written only when an output directory is configured; save_dir_path\n    # does not exist otherwise.\n    if result_save_dir is not None:\n        save_dir_path = os.path.abspath(result_save_dir)\n        os.makedirs(save_dir_path, exist_ok=True)\n\n        results_to_save = {'init_result.pkl': initialization_result,\n                           'pooled_train_result.pkl': pooled_train_result,\n                           'col_performance.pkl': collaboration_performance}\n        # pre_train_result only exists when the models were pre-trained in this run\n        # (model_saved_dir is None); nothing is written for it otherwise.\n        if model_saved_dir is None:\n            results_to_save['pre_train_result.pkl'] = pre_train_result\n\n        for file_name, result in results_to_save.items():\n            with open(os.path.join(save_dir_path, file_name), 'wb') as f:\n                pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)\n"
  },
  {
    "path": "FEMNIST_Imbalanced.py",
    "content": "# FedMD experiment script: MNIST serves as the public dataset; the private EMNIST data\n# are generated per party by generate_EMNIST_writer_based_data from the writer ids.\nimport os\nimport errno\nimport argparse\nimport sys\nimport pickle\n\nimport numpy as np\nfrom tensorflow.keras.models import load_model\n\nfrom data_utils import load_MNIST_data, load_EMNIST_data, generate_EMNIST_writer_based_data, generate_partial_data\nfrom FedMD import FedMD\nfrom Neural_Networks import train_models, cnn_2layer_fc_model, cnn_3layer_fc_model\n\n\ndef parseArg():\n    \"\"\"Return the config file path: the -conf argument when given, otherwise the default.\"\"\"\n    parser = argparse.ArgumentParser(description='FedMD, a federated learning framework. \\\n    Participants are training collaboratively. ')\n    parser.add_argument('-conf', metavar='conf_file', nargs=1,\n                        help='the config file for FedMD.'\n                       )\n\n    conf_file = os.path.abspath(\"conf/EMNIST_imbalance_conf.json\")\n\n    if len(sys.argv) > 1:\n        args = parser.parse_args(sys.argv[1:])\n        if args.conf:\n            conf_file = args.conf[0]\n    return conf_file\n\n\nCANDIDATE_MODELS = {\"2_layer_CNN\": cnn_2layer_fc_model,\n                    \"3_layer_CNN\": cnn_3layer_fc_model}\n\n\nif __name__ == \"__main__\":\n    conf_file = parseArg()\n    with open(conf_file, \"r\") as f:\n        # NOTE(review): eval() executes any Python expression found in the config file,\n        # so only trusted config files may be passed via -conf.\n        conf_dict = eval(f.read())\n\n    model_config = conf_dict[\"models\"]\n    pre_train_params = conf_dict[\"pre_train_params\"]\n    model_saved_dir = conf_dict[\"model_saved_dir\"]\n    model_saved_names = conf_dict[\"model_saved_names\"]\n    is_early_stopping = conf_dict[\"early_stopping\"]\n    public_classes = conf_dict[\"public_classes\"]\n    private_classes = conf_dict[\"private_classes\"]\n    n_classes = len(public_classes) + len(private_classes)\n\n    emnist_data_dir = conf_dict[\"EMNIST_dir\"]\n    N_parties = conf_dict[\"N_parties\"]\n    N_samples_per_class = conf_dict[\"N_samples_per_class\"]\n\n    N_rounds = conf_dict[\"N_rounds\"]\n    N_alignment = conf_dict[\"N_alignment\"]\n    N_private_training_round = conf_dict[\"N_private_training_round\"]\n    private_training_batchsize = conf_dict[\"private_training_batchsize\"]\n    N_logits_matching_round = conf_dict[\"N_logits_matching_round\"]\n    logits_matching_batchsize = conf_dict[\"logits_matching_batchsize\"]\n\n    result_save_dir = conf_dict[\"result_save_dir\"]\n\n    del conf_dict, conf_file\n\n    X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST = load_MNIST_data(\n        standarized=True, verbose=True)\n\n    public_dataset = {\"X\": X_train_MNIST, \"y\": y_train_MNIST}\n\n    (X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST,\n     writer_ids_train_EMNIST, writer_ids_test_EMNIST) = load_EMNIST_data(\n        emnist_data_dir, standarized=True, verbose=True)\n\n    # Shift the EMNIST labels by the number of public classes.\n    y_train_EMNIST += len(public_classes)\n    y_test_EMNIST += len(public_classes)\n\n    #generate private data\n    private_data, total_private_data = generate_EMNIST_writer_based_data(\n        X_train_EMNIST, y_train_EMNIST,\n        writer_ids_train_EMNIST,\n        N_parties=N_parties,\n        classes_in_use=private_classes,\n        N_priv_data_min=N_samples_per_class * len(private_classes))\n\n    X_tmp, y_tmp = generate_partial_data(X=X_test_EMNIST, y=y_test_EMNIST,\n                                         class_in_use=private_classes, verbose=True)\n    private_test_data = {\"X\": X_tmp, \"y\": y_tmp}\n    del X_tmp, y_tmp\n\n    parties = []\n    if model_saved_dir is None:\n        # No saved models configured: build every model from the config and pre-train it on MNIST.\n        for i, item in enumerate(model_config):\n            model_name = item[\"model_type\"]\n            model_params = item[\"params\"]\n            tmp = CANDIDATE_MODELS[model_name](n_classes=n_classes,\n                                               input_shape=(28,28),\n                                               **model_params)\n            print(\"model {0} : {1}\".format(i, model_saved_names[i]))\n            # summary() prints the table itself and returns None, so it is not wrapped in print()\n            tmp.summary()\n            parties.append(tmp)\n\n            del model_name, model_params, tmp\n        #END FOR LOOP\n        pre_train_result = train_models(parties,\n                                        X_train_MNIST, y_train_MNIST,\n                                        X_test_MNIST, y_test_MNIST,\n                                        save_dir = model_saved_dir, save_names = model_saved_names,\n                                        early_stopping = is_early_stopping,\n                                        **pre_train_params\n                                       )\n    else:\n        dpath = os.path.abspath(model_saved_dir)\n        # os.listdir() returns names in arbitrary order; sorting keeps the assignment of\n        # saved models to party indices reproducible between runs.\n        for name in sorted(os.listdir(dpath)):\n            parties.append(load_model(os.path.join(dpath, name)))\n\n    del X_train_MNIST, y_train_MNIST, X_test_MNIST, y_test_MNIST\n    del X_train_EMNIST, y_train_EMNIST, X_test_EMNIST, y_test_EMNIST\n    del writer_ids_train_EMNIST, writer_ids_test_EMNIST\n\n    fedmd = FedMD(parties,\n                  public_dataset = public_dataset,\n                  private_data = private_data,\n                  total_private_data = total_private_data,\n                  private_test_data = private_test_data,\n                  N_rounds = N_rounds,\n                  N_alignment = N_alignment,\n                  N_logits_matching_round = N_logits_matching_round,\n                  logits_matching_batchsize = logits_matching_batchsize,\n                  N_private_training_round = N_private_training_round,\n                  private_training_batchsize = private_training_batchsize)\n\n    initialization_result = fedmd.init_result\n    pooled_train_result = fedmd.pooled_train_result\n\n    collaboration_performance = fedmd.collaborative_training()\n\n    # Results are written only when an output directory is configured; save_dir_path\n    # does not exist otherwise.\n    if result_save_dir is not None:\n        save_dir_path = os.path.abspath(result_save_dir)\n        os.makedirs(save_dir_path, exist_ok=True)\n\n        results_to_save = {'init_result.pkl': initialization_result,\n                           'pooled_train_result.pkl': pooled_train_result,\n                           'col_performance.pkl': collaboration_performance}\n        # pre_train_result only exists when the models were pre-trained in this run\n        # (model_saved_dir is None); nothing is written for it otherwise.\n        if model_saved_dir is None:\n            results_to_save['pre_train_result.pkl'] = pre_train_result\n\n        for file_name, result in results_to_save.items():\n            with open(os.path.join(save_dir_path, file_name), 'wb') as f:\n                pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)\n"
  },
  {
    "path": "FedMD.py",
    "content": "import numpy as np\nfrom tensorflow.keras.models import clone_model, load_model\nfrom tensorflow.keras.callbacks import EarlyStopping\nimport tensorflow as tf\n\nfrom data_utils import generate_alignment_data\nfrom Neural_Networks import remove_last_layer\n\nclass FedMD():\n    def __init__(self, parties, public_dataset, \n                 private_data, total_private_data,  \n                 private_test_data, N_alignment,\n                 N_rounds, \n                 N_logits_matching_round, logits_matching_batchsize, \n                 N_private_training_round, private_training_batchsize):\n        \n        self.N_parties = len(parties)\n        self.public_dataset = public_dataset\n        self.private_data = private_data\n        self.private_test_data = private_test_data\n        self.N_alignment = N_alignment\n        \n        self.N_rounds = N_rounds\n        self.N_logits_matching_round = N_logits_matching_round\n        self.logits_matching_batchsize = logits_matching_batchsize\n        self.N_private_training_round = N_private_training_round\n        self.private_training_batchsize = private_training_batchsize\n        \n        self.collaborative_parties = []\n        self.init_result = []\n        \n        print(\"start model initialization: \")\n        for i in range(self.N_parties):\n            print(\"model \", i)\n            model_A_twin = None\n            model_A_twin = clone_model(parties[i])\n            model_A_twin.set_weights(parties[i].get_weights())\n            model_A_twin.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), \n                                 loss = \"sparse_categorical_crossentropy\",\n                                 metrics = [\"accuracy\"])\n            \n            print(\"start full stack training ... 
\")        \n            \n            model_A_twin.fit(private_data[i][\"X\"], private_data[i][\"y\"],\n                             batch_size = 32, epochs = 25, shuffle=True, verbose = 0,\n                             validation_data = [private_test_data[\"X\"], private_test_data[\"y\"]],\n                             callbacks=[EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=10)]\n                            )\n            \n            print(\"full stack training done\")\n            \n            model_A = remove_last_layer(model_A_twin, loss=\"mean_absolute_error\")\n            \n            self.collaborative_parties.append({\"model_logits\": model_A, \n                                               \"model_classifier\": model_A_twin,\n                                               \"model_weights\": model_A_twin.get_weights()})\n            \n            self.init_result.append({\"val_acc\": model_A_twin.history.history['val_accuracy'],\n                                     \"train_acc\": model_A_twin.history.history['accuracy'],\n                                     \"val_loss\": model_A_twin.history.history['val_loss'],\n                                     \"train_loss\": model_A_twin.history.history['loss'],\n                                    })\n            \n            print()\n            del model_A, model_A_twin\n        #END FOR LOOP\n        \n        print(\"calculate the theoretical upper bounds for participants: \")\n        \n        self.upper_bounds = []\n        self.pooled_train_result = []\n        for model in parties:\n            model_ub = clone_model(model)\n            model_ub.set_weights(model.get_weights())\n            model_ub.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3),\n                             loss = \"sparse_categorical_crossentropy\", \n                             metrics = [\"accuracy\"])\n            \n            model_ub.fit(total_private_data[\"X\"], total_private_data[\"y\"],\n    
                     batch_size = 32, epochs = 50, shuffle=True, verbose = 0, \n                         validation_data = [private_test_data[\"X\"], private_test_data[\"y\"]],\n                         callbacks=[EarlyStopping(monitor=\"val_accuracy\", min_delta=0.001, patience=10)])\n            \n            self.upper_bounds.append(model_ub.history.history[\"val_accuracy\"][-1])\n            self.pooled_train_result.append({\"val_acc\": model_ub.history.history[\"val_accuracy\"], \n                                             \"acc\": model_ub.history.history[\"accuracy\"]})\n            \n            del model_ub    \n        print(\"the upper bounds are:\", self.upper_bounds)\n    \n    def collaborative_training(self):\n        # start collaborating training    \n        collaboration_performance = {i: [] for i in range(self.N_parties)}\n        r = 0\n        while True:\n            # At beginning of each round, generate new alignment dataset\n            alignment_data = generate_alignment_data(self.public_dataset[\"X\"], \n                                                     self.public_dataset[\"y\"],\n                                                     self.N_alignment)\n            \n            print(\"round \", r)\n            \n            print(\"update logits ... \")\n            # update logits\n            logits = 0\n            for d in self.collaborative_parties:\n                d[\"model_logits\"].set_weights(d[\"model_weights\"])\n                logits += d[\"model_logits\"].predict(alignment_data[\"X\"], verbose = 0)\n                \n            logits /= self.N_parties\n            \n            # test performance\n            print(\"test performance ... 
\")\n            \n            for index, d in enumerate(self.collaborative_parties):\n                y_pred = d[\"model_classifier\"].predict(self.private_test_data[\"X\"], verbose = 0).argmax(axis = 1)\n                collaboration_performance[index].append(np.mean(self.private_test_data[\"y\"] == y_pred))\n                \n                print(collaboration_performance[index][-1])\n                del y_pred\n                \n                \n            r+= 1\n            if r > self.N_rounds:\n                break\n                \n                \n            print(\"updates models ...\")\n            for index, d in enumerate(self.collaborative_parties):\n                print(\"model {0} starting alignment with public logits... \".format(index))\n                \n                \n                weights_to_use = None\n                weights_to_use = d[\"model_weights\"]\n\n                d[\"model_logits\"].set_weights(weights_to_use)\n                d[\"model_logits\"].fit(alignment_data[\"X\"], logits, \n                                      batch_size = self.logits_matching_batchsize,  \n                                      epochs = self.N_logits_matching_round, \n                                      shuffle=True, verbose = 0)\n                d[\"model_weights\"] = d[\"model_logits\"].get_weights()\n                print(\"model {0} done alignment\".format(index))\n\n                print(\"model {0} starting training with private data... 
\".format(index))\n                weights_to_use = None\n                weights_to_use = d[\"model_weights\"]\n                d[\"model_classifier\"].set_weights(weights_to_use)\n                d[\"model_classifier\"].fit(self.private_data[index][\"X\"], \n                                          self.private_data[index][\"y\"],       \n                                          batch_size = self.private_training_batchsize, \n                                          epochs = self.N_private_training_round, \n                                          shuffle=True, verbose = 0)\n\n                d[\"model_weights\"] = d[\"model_classifier\"].get_weights()\n                print(\"model {0} done private training. \\n\".format(index))\n            #END FOR LOOP\n        \n        #END WHILE LOOP\n        return collaboration_performance\n\n\n        "
  },
  {
    "path": "Neural_Networks.py",
    "content": "from tensorflow.keras.models import Model, Sequential, clone_model, load_model\nfrom tensorflow.keras.layers import Input, Dense, add, concatenate, Conv2D,Dropout,\\\nBatchNormalization, Flatten, MaxPooling2D, AveragePooling2D, Activation, Dropout, Reshape\nfrom tensorflow.keras.callbacks import EarlyStopping\nimport tensorflow as tf\n\n\ndef cnn_3layer_fc_model(n_classes,n1 = 128, n2=192, n3=256, dropout_rate = 0.2,input_shape = (28,28)):\n    model_A, x = None, None\n     \n    x = Input(input_shape)\n    if len(input_shape)==2: \n        y = Reshape((input_shape[0], input_shape[1], 1))(x)\n    else:\n        y = Reshape(input_shape)(x)\n    y = Conv2D(filters = n1, kernel_size = (3,3), strides = 1, padding = \"same\", \n            activation = None)(y)\n    y = BatchNormalization()(y)\n    y = Activation(\"relu\")(y)\n    y = Dropout(dropout_rate)(y)\n    y = AveragePooling2D(pool_size = (2,2), strides = 1, padding = \"same\")(y)\n\n    y = Conv2D(filters = n2, kernel_size = (2,2), strides = 2, padding = \"valid\", \n            activation = None)(y)\n    y = BatchNormalization()(y)\n    y = Activation(\"relu\")(y)\n    y = Dropout(dropout_rate)(y)\n    y = AveragePooling2D(pool_size = (2,2), strides = 2, padding = \"valid\")(y)\n\n    y = Conv2D(filters = n3, kernel_size = (3,3), strides = 2, padding = \"valid\", \n            activation = None)(y)\n    y = BatchNormalization()(y)\n    y = Activation(\"relu\")(y)\n    y = Dropout(dropout_rate)(y)\n    #y = AveragePooling2D(pool_size = (2,2), strides = 2, padding = \"valid\")(y)\n\n    y = Flatten()(y)\n    y = Dense(units = n_classes, activation = None, use_bias = False,\n            kernel_regularizer=tf.keras.regularizers.l2(1e-3))(y)\n    y = Activation(\"softmax\")(y)\n\n\n    model_A = Model(inputs = x, outputs = y)\n\n    model_A.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), \n                        loss = \"sparse_categorical_crossentropy\",\n                        metrics = 
[\"accuracy\"])\n    return model_A\n  \ndef cnn_2layer_fc_model(n_classes,n1 = 128, n2=256, dropout_rate = 0.2,input_shape = (28,28)):\n    model_A, x = None, None\n    \n    x = Input(input_shape)\n    if len(input_shape)==2: \n        y = Reshape((input_shape[0], input_shape[1], 1))(x)\n    else:\n        y = Reshape(input_shape)(x)\n    y = Conv2D(filters = n1, kernel_size = (3,3), strides = 1, padding = \"same\", \n            activation = None)(y)\n    y = BatchNormalization()(y)\n    y = Activation(\"relu\")(y)\n    y = Dropout(dropout_rate)(y)\n    y = AveragePooling2D(pool_size = (2,2), strides = 1, padding = \"same\")(y)\n\n\n    y = Conv2D(filters = n2, kernel_size = (3,3), strides = 2, padding = \"valid\", \n            activation = None)(y)\n    y = BatchNormalization()(y)\n    y = Activation(\"relu\")(y)\n    y = Dropout(dropout_rate)(y)\n    #y = AveragePooling2D(pool_size = (2,2), strides = 2, padding = \"valid\")(y)\n\n    y = Flatten()(y)\n    y = Dense(units = n_classes, activation = None, use_bias = False,\n            kernel_regularizer=tf.keras.regularizers.l2(1e-3))(y)\n    y = Activation(\"softmax\")(y)\n\n\n    model_A = Model(inputs = x, outputs = y)\n\n    model_A.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), \n                        loss = \"sparse_categorical_crossentropy\",\n                        metrics = [\"accuracy\"])\n    return model_A\n\n\ndef remove_last_layer(model, loss = \"mean_absolute_error\"):\n    \"\"\"\n    Input: Keras model, a classification model whose last layer is a softmax activation\n    Output: Keras model, the same model with the last softmax activation layer removed,\n        while keeping the same parameters \n    \"\"\"\n    \n    new_model = Model(inputs = model.inputs, outputs = model.layers[-2].output)\n    new_model.set_weights(model.get_weights())\n    new_model.compile(optimizer=tf.keras.optimizers.Adam(lr = 1e-3), \n                      loss = loss)\n    \n    return new_model\n\n\n\ndef 
train_models(models, X_train, y_train, X_test, y_test, \n                 save_dir = \"./\", save_names = None,\n                 early_stopping = True, min_delta = 0.001, patience = 3, \n                 batch_size = 128, epochs = 20, is_shuffle=True, verbose = 1\n                ):\n    '''\n    Train an array of models on the same dataset. \n    We use early termination to speed up training. \n    '''\n    \n    resulting_val_acc = []\n    record_result = []\n    for n, model in enumerate(models):\n        print(\"Training model \", n)\n        if early_stopping:\n            model.fit(X_train, y_train, \n                      validation_data = [X_test, y_test],\n                      callbacks=[EarlyStopping(monitor='val_accuracy', min_delta=min_delta, patience=patience)],\n                      batch_size = batch_size, epochs = epochs, shuffle=is_shuffle, verbose = verbose\n                     )\n        else:\n            model.fit(X_train, y_train, \n                      validation_data = [X_test, y_test],\n                      batch_size = batch_size, epochs = epochs, shuffle=is_shuffle, verbose = verbose\n                     )\n        \n        resulting_val_acc.append(model.history.history[\"val_accuracy\"][-1])\n        record_result.append({\"train_acc\": model.history.history[\"accuracy\"], \n                              \"val_acc\": model.history.history[\"val_accuracy\"],\n                              \"train_loss\": model.history.history[\"loss\"], \n                              \"val_loss\": model.history.history[\"val_loss\"]})\n        \n        if save_dir is not None:\n            save_dir_path = os.path.abspath(save_dir)\n            #make dir\n            try:\n                os.makedirs(save_dir_path)\n            except OSError as e:\n                if e.errno != errno.EEXIST:\n                    raise    \n\n            if save_names is None:\n                file_name = save_dir + \"model_{0}\".format(n) + \".h5\"\n            
else:\n                file_name = save_dir + save_names[n] + \".h5\"\n            model.save(file_name)\n    \n    print(\"pre-train accuracy: \")\n    print(resulting_val_acc)\n        \n    return record_result"
  },
  {
    "path": "README.md",
    "content": "# FedMD\nFedMD: Heterogenous Federated Learning via Model Distillation. \nPreprint on https://arxiv.org/abs/1910.03581.\n\n## Run scripts on Google Colab\n\n1. open a google Colab\n\n2. Clone the project folder from Github\n```\n! git clone github_link\n```\n\n3. Then access the folder just created. \n```\n% cd project_folder/\n```\n\n4. Run the python script in Colab. For instance \n``` \n! python FEMNIST_Balanced.py -conf conf/EMNIST_balance_conf.json\n```\n"
  },
  {
    "path": "conf/CIFAR_balance_conf.json",
    "content": "{\n    \"models\": [{\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 384, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, 'n2': 512, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 256, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 512, \"dropout_rate\": 0.4}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 192, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 128, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.3}}\n              ],\n    \"pre_train_params\": {\"min_delta\": 0.005, \"patience\": 3,\n                     \"batch_size\": 128, \"epochs\": 20, \"is_shuffle\": True, \n                     \"verbose\": 1},\n    \"model_saved_dir\": None,\n    \"model_saved_names\" : [\"CNN_128_256\", \"CNN_128_384\", \"CNN_128_512\", \"CNN_256_256\", \"CNN_256_512\", \n                    \"CNN_64_128_256\", \"CNN_64_128_192\", \"CNN_128_192_256\", \"CNN_128_128_128\", \"CNN_128_128_192\"],\n    \"early_stopping\" : True,\n    \"N_parties\": 10,\n    \"N_samples_per_class\": 3,\n    \"N_alignment\": 5000, \n    \"private_classes\": [0,2,20,63,71,82],\n    \"public_classes\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n    \"is_show\": False,\n    \"N_rounds\": 20,\n    
\"N_logits_matching_round\": 1, \n    \"N_private_training_round\": 4,\n    \"private_training_batchsize\" : 5, \n    \"logits_matching_batchsize\": 256, \n    \"EMNIST_dir\": None,\n    \"result_save_dir\": \"./result_CIFAR_balanced/\"\n}\n"
  },
  {
    "path": "conf/CIFAR_imbalance_conf.json",
    "content": "{\n    \"models\": [{\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 384, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, 'n2': 512, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 256, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 512, \"dropout_rate\": 0.4}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 192, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 128, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.3}}\n              ],\n    \"pre_train_params\": {\"min_delta\": 0.005, \"patience\": 3,\n                     \"batch_size\": 128, \"epochs\": 20, \"is_shuffle\": True, \n                     \"verbose\": 1},\n    \"model_saved_dir\": None,\n    \"model_saved_names\" : [\"CNN_128_256\", \"CNN_128_384\", \"CNN_128_512\", \"CNN_256_256\", \"CNN_256_512\", \n                    \"CNN_64_128_256\", \"CNN_64_128_192\", \"CNN_128_192_256\", \"CNN_128_128_128\", \"CNN_128_128_192\"],\n    \"early_stopping\" : True,\n    \"N_parties\": 10,\n    \"N_samples_per_class\": 20,\n    \"N_alignment\": 5000, \n    \"private_classes\": [0,1,7,9,12,18],\n    \"public_classes\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n    \"is_show\": False,\n    \"N_rounds\": 13,\n    \"N_logits_matching_round\": 
1, \n    \"N_private_training_round\": 10,\n    \"private_training_batchsize\" : 10, \n    \"logits_matching_batchsize\": 128, \n    \"EMNIST_dir\": None,\n    \"result_save_dir\": \"./result_CIFAR_imbalanced/\"\n}"
  },
  {
    "path": "conf/EMNIST_balance_conf.json",
    "content": "{\n    \"models\": [{\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 384, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, 'n2': 512, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 256, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 512, \"dropout_rate\": 0.4}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 192, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 128, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.3}}\n              ],\n    \"pre_train_params\": {\"min_delta\": 0.001, \"patience\": 3,\n                     \"batch_size\": 128, \"epochs\": 20, \"is_shuffle\": True, \n                     \"verbose\": 1},\n    \"model_saved_dir\": None,\n    \"model_saved_names\" : [\"CNN_128_256\", \"CNN_128_384\", \"CNN_128_512\", \"CNN_256_256\", \"CNN_256_512\", \n                    \"CNN_64_128_256\", \"CNN_64_128_192\", \"CNN_128_192_256\", \"CNN_128_128_128\", \"CNN_128_128_192\"],\n    \"early_stopping\" : True,\n    \"N_parties\": 10,\n    \"N_samples_per_class\": 3,\n    \"N_alignment\": 5000, \n    \"private_classes\": [10, 11, 12, 13, 14, 15],\n    \"public_classes\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n    \"is_show\": False,\n    \"N_rounds\": 20,\n    
\"N_logits_matching_round\": 1, \n    \"N_private_training_round\": 2,\n    \"private_training_batchsize\" : 5, \n    \"logits_matching_batchsize\": 256, \n    \"EMNIST_dir\": \"./dataset/emnist-letters.mat\",\n    \"result_save_dir\": \"./result_FEMNIST_balanced/\"\n}"
  },
  {
    "path": "conf/EMNIST_imbalance_conf.json",
    "content": "{\n    \"models\": [{\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 384, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 128, 'n2': 512, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 256, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"2_layer_CNN\", \"params\": {\"n1\": 256, \"n2\": 512, \"dropout_rate\": 0.4}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 64, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 192, \"n3\": 256, \"dropout_rate\": 0.2}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 128, \"dropout_rate\": 0.3}},\n               {\"model_type\": \"3_layer_CNN\", \"params\": {\"n1\": 128, \"n2\": 128, \"n3\": 192, \"dropout_rate\": 0.3}}\n              ],\n    \"pre_train_params\": {\"min_delta\": 0.001, \"patience\": 3,\n                     \"batch_size\": 128, \"epochs\": 20, \"is_shuffle\": True, \n                     \"verbose\": 1},\n    \"model_saved_dir\": None,\n    \"model_saved_names\" : [\"CNN_128_256\", \"CNN_128_384\", \"CNN_128_512\", \"CNN_256_256\", \"CNN_256_512\", \n                    \"CNN_64_128_256\", \"CNN_64_128_192\", \"CNN_128_192_256\", \"CNN_128_128_128\", \"CNN_128_128_192\"],\n    \"early_stopping\" : True,\n    \"N_parties\": 10,\n    \"N_samples_per_class\": 3,\n    \"N_alignment\": 5000, \n    \"private_classes\": [10, 11, 12, 13, 14, 15],\n    \"public_classes\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n    \"is_show\": False,\n    \"N_rounds\": 20,\n    
\"N_logits_matching_round\": 1, \n    \"N_private_training_round\": 4,\n    \"private_training_batchsize\" : 5, \n    \"logits_matching_batchsize\": 256, \n    \"EMNIST_dir\": \"./dataset/emnist-letters.mat\",\n    \"result_save_dir\": \"./result_FEMNIST_imbalanced/\"\n}"
  },
  {
    "path": "data_utils.py",
    "content": "import pickle\nimport os\n\nimport numpy as np\nimport pandas as pd\nfrom sklearn.model_selection import StratifiedShuffleSplit\nfrom tensorflow.keras.datasets import cifar10, cifar100, mnist\nimport scipy.io as sio\n\n\ndef load_MNIST_data(standarized = False, verbose = False):\n    (X_train, y_train), (X_test, y_test) = mnist.load_data()\n    \n    if standarized: \n        X_train = X_train/255\n        X_test = X_test/255\n        mean_image = np.mean(X_train, axis=0)\n        X_train -= mean_image\n        X_test -= mean_image\n    \n    if verbose == True: \n        print(\"MNIST dataset ... \")\n        print(\"X_train shape :\", X_train.shape)\n        print(\"X_test shape :\", X_test.shape)\n        print(\"y_train shape :\", y_train.shape)\n        print(\"y_test shape :\", y_test.shape)\n    \n    return X_train, y_train, X_test, y_test\n\n\ndef load_EMNIST_data(file, verbose = False, standarized = False):\n    \"\"\"\n    file should be the downloaded EMNIST file in .mat format.\n    \"\"\"    \n    mat = sio.loadmat(file)\n    data = mat[\"dataset\"]\n    \n    \n    \n    writer_ids_train = data['train'][0,0]['writers'][0,0]\n    writer_ids_train = np.squeeze(writer_ids_train)\n    X_train = data['train'][0,0]['images'][0,0]\n    X_train = X_train.reshape((X_train.shape[0], 28, 28), order = \"F\")\n    y_train = data['train'][0,0]['labels'][0,0]\n    y_train = np.squeeze(y_train)\n    y_train -= 1 #y_train is zero-based\n    \n    writer_ids_test = data['test'][0,0]['writers'][0,0]\n    writer_ids_test = np.squeeze(writer_ids_test)\n    X_test = data['test'][0,0]['images'][0,0]\n    X_test= X_test.reshape((X_test.shape[0], 28, 28), order = \"F\")\n    y_test = data['test'][0,0]['labels'][0,0]\n    y_test = np.squeeze(y_test)\n    y_test -= 1 #y_test is zero-based\n\n    \n    if standarized: \n        X_train = X_train/255\n        X_test = X_test/255\n        mean_image = np.mean(X_train, axis=0)\n        X_train -= mean_image\n       
 X_test -= mean_image\n    \n\n    if verbose == True: \n        print(\"EMNIST-letter dataset ... \")\n        print(\"X_train shape :\", X_train.shape)\n        print(\"X_test shape :\", X_test.shape)\n        print(\"y_train shape :\", y_train.shape)\n        print(\"y_test shape :\", y_test.shape)\n    \n    return X_train, y_train, X_test, y_test, writer_ids_train, writer_ids_test\n\n\ndef load_CIFAR_data(data_type=\"CIFAR10\", label_mode=\"fine\",\n                    standarized = False, verbose = False):    \n    if data_type == \"CIFAR10\":\n        (X_train, y_train), (X_test, y_test) = cifar10.load_data()\n    elif data_type == \"CIFAR100\":\n        (X_train, y_train), (X_test, y_test) = cifar100.load_data(label_mode = label_mode)\n    else:\n        print(\"Unknown Data type. Stopped!\")\n        return None\n      \n    \n    y_train = np.squeeze(y_train)\n    y_test = np.squeeze(y_test)\n    # substract mean and normalized to [-1/2,1/2]\n    if standarized: \n        X_train = X_train/255\n        X_test = X_test/255\n        mean_image = np.mean(X_train, axis=0)\n        X_train -= mean_image\n        X_test -= mean_image\n        \n    \n    \n    if verbose == True: \n        print(\"X_train shape :\", X_train.shape)\n        print(\"X_test shape :\", X_test.shape)\n        print(\"y_train shape :\", y_train.shape)\n        print(\"y_test shape :\", y_test.shape)\n    \n    return X_train, y_train, X_test, y_test\n\ndef load_CIFAR_from_local(local_dir, data_type=\"CIFAR10\", with_coarse_label = False, \n                          standarized = False, verbose = False):\n    #dir_name = os.path.abspath(local_dir)\n    if data_type == \"CIFAR10\":\n        X_train, y_train = [], [] \n        for i in range(1, 6, 1):\n            file_name = None\n            file_name = os.path.join(local_dir + \"data_batch_{0}\".format(i))\n            X_tmp, y_tmp = None, None\n            with open(file_name, 'rb') as fo:\n                datadict = pickle.load(fo, 
encoding='bytes')\n            \n            X_tmp = datadict[b'data']\n            y_tmp = datadict[b'labels']\n            X_tmp = X_tmp.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype(\"float\")\n            y_tmp = np.array(y_tmp)\n            \n            X_train.append(X_tmp)\n            y_train.append(y_tmp)\n            del X_tmp, y_tmp\n        X_train = np.vstack(X_train)\n        y_train = np.hstack(y_train)\n        \n        file_name = None\n        file_name = os.path.join(local_dir + \"test_batch\")\n        with open(file_name, 'rb') as fo:\n            datadict = pickle.load(fo, encoding='bytes')\n            \n            X_test = datadict[b'data']\n            y_test = datadict[b'labels']\n            X_test = X_test.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype(\"float\")\n            y_test = np.array(y_test)\n            \n            \n    elif data_type == \"CIFAR100\":\n        file_name = None \n        file_name = os.path.abspath(local_dir + \"train\")\n        with open(file_name, 'rb') as fo:\n            datadict = pickle.load(fo, encoding='bytes')\n            X_train = datadict[b'data']\n            if with_coarse_label:\n                y_train = datadict[b'coarse_labels']\n            else:\n                y_train = datadict[b'fine_labels']\n            X_train = X_train.reshape(50000, 3, 32, 32).transpose(0,2,3,1).astype(\"float\")\n            y_train = np.array(y_train)\n        \n        file_name = None \n        file_name = os.path.join(local_dir + \"test\")\n        with open(file_name, 'rb') as fo:\n            datadict = pickle.load(fo, encoding='bytes')\n            X_test = datadict[b'data']\n            if with_coarse_label:\n                y_test = datadict[b'coarse_labels']\n            else:\n                y_test = datadict[b'fine_labels']\n            X_test = X_test.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype(\"float\")\n            y_test = np.array(y_test)\n        \n    else:\n        
print(\"Unknown Data type. Stopped!\")\n        return None   \n    \n    if standarized: \n        X_train = X_train/255\n        X_test = X_test/255\n        mean_image = np.mean(X_train, axis=0)\n        X_train -= mean_image\n        X_test -= mean_image\n    \n    if verbose == True: \n        print(\"X_train shape :\", X_train.shape)\n        print(\"X_test shape :\", X_test.shape)\n        print(\"y_train shape :\", y_train.shape)\n        print(\"y_test shape :\", y_test.shape)\n        \n    return X_train, y_train, X_test, y_test\n\n\n\n\ndef generate_partial_data(X, y, class_in_use = None, verbose = False):\n    if class_in_use is None:\n        idx = np.ones_like(y, dtype = bool)\n    else:\n        idx = [y == i for i in class_in_use]\n        idx = np.any(idx, axis = 0)\n    X_incomplete, y_incomplete = X[idx], y[idx]\n    if verbose == True:\n        print(\"X shape :\", X_incomplete.shape)\n        print(\"y shape :\", y_incomplete.shape)\n    return X_incomplete, y_incomplete\n\n\n\ndef generate_bal_private_data(X, y, N_parties = 10, classes_in_use = range(11), \n                              N_samples_per_class = 20, data_overlap = False):\n    \"\"\"\n    Input: \n    -- N_parties : int, number of collaboraters in this activity;\n    -- classes_in_use: array or generator, the classes of EMNIST-letters dataset \n    (0 <= y <= 25) to be used as private data; \n    -- N_sample_per_class: int, the number of private data points of each class for each party\n    \n    return: \n    \n    \"\"\"\n    priv_data = [None] * N_parties\n    combined_idx = np.array([], dtype = np.int16)\n    for cls in classes_in_use:\n        idx = np.where(y == cls)[0]\n        idx = np.random.choice(idx, N_samples_per_class * N_parties, \n                               replace = data_overlap)\n        combined_idx = np.r_[combined_idx, idx]\n        for i in range(N_parties):           \n            idx_tmp = idx[i * N_samples_per_class : (i + 1)*N_samples_per_class]\n    
        if priv_data[i] is None:\n                tmp = {}\n                tmp[\"X\"] = X[idx_tmp]\n                tmp[\"y\"] = y[idx_tmp]\n                tmp[\"idx\"] = idx_tmp\n                priv_data[i] = tmp\n            else:\n                priv_data[i]['idx'] = np.r_[priv_data[i][\"idx\"], idx_tmp]\n                priv_data[i][\"X\"] = np.vstack([priv_data[i][\"X\"], X[idx_tmp]])\n                priv_data[i][\"y\"] = np.r_[priv_data[i][\"y\"], y[idx_tmp]]\n                \n                \n    total_priv_data = {}\n    total_priv_data[\"idx\"] = combined_idx\n    total_priv_data[\"X\"] = X[combined_idx]\n    total_priv_data[\"y\"] = y[combined_idx]\n    return priv_data, total_priv_data\n\n\ndef generate_alignment_data(X, y, N_alignment = 3000):\n    \n    split = StratifiedShuffleSplit(n_splits=1, train_size= N_alignment)\n    if N_alignment == \"all\":\n        alignment_data = {}\n        alignment_data[\"idx\"] = np.arange(y.shape[0])\n        alignment_data[\"X\"] = X\n        alignment_data[\"y\"] = y\n        return alignment_data\n    for train_index, _ in split.split(X, y):\n        X_alignment = X[train_index]\n        y_alignment = y[train_index]\n    alignment_data = {}\n    alignment_data[\"idx\"] = train_index\n    alignment_data[\"X\"] = X_alignment\n    alignment_data[\"y\"] = y_alignment\n    \n    return alignment_data\n\ndef generate_EMNIST_writer_based_data(X, y, writer_info, N_priv_data_min = 30, \n                                      N_parties = 5, classes_in_use = range(6)):\n    \n    # mask is a boolean array of the same shape as y\n    # mask[i] = True if y[i] in classes_in_use\n    mask = None\n    mask = [y == i for i in classes_in_use]\n    mask = np.any(mask, axis = 0)\n    \n    df_tmp = None\n    df_tmp = pd.DataFrame({\"writer_ids\": writer_info, \"is_in_use\": mask})\n    #print(df_tmp.head())\n    groupped = df_tmp[df_tmp[\"is_in_use\"]].groupby(\"writer_ids\")\n    \n    # organize the input the data (X,y) by 
writer_ids.\n    # That is, \n    # data_by_writer is a dictionary where the keys are writer_ids,\n    # and the contents are the corresponding data. \n    # Notice that only data with labels in classes_in_use are included.\n    data_by_writer = {}\n    writer_ids = []\n    for wt_id, idx in groupped.groups.items():\n        if len(idx) >= N_priv_data_min:  \n            writer_ids.append(wt_id)\n            data_by_writer[wt_id] = {\"X\": X[idx], \"y\": y[idx], \n                                     \"idx\": idx, \"writer_id\": wt_id}\n            \n    # each participant in the collaborative group is assigned data \n    # from a single writer.\n    ids_to_use = np.random.choice(writer_ids, size = N_parties, replace = False)\n    combined_idx = np.array([], dtype = np.int64)\n    private_data = []\n    for i in range(N_parties):\n        id_tmp = ids_to_use[i]\n        private_data.append(data_by_writer[id_tmp])\n        combined_idx = np.r_[combined_idx, data_by_writer[id_tmp][\"idx\"]]\n        del id_tmp\n    \n    total_priv_data = {}\n    total_priv_data[\"idx\"] = combined_idx\n    total_priv_data[\"X\"] = X[combined_idx]\n    total_priv_data[\"y\"] = y[combined_idx]\n    return private_data, total_priv_data\n\n\ndef generate_imbal_CIFAR_private_data(X, y, y_super, classes_per_party, N_parties,\n                                      samples_per_class=7):\n\n    priv_data = [None] * N_parties\n    combined_idxs = []\n    count = 0\n    for subcls_list in classes_per_party:\n        idxs_per_party = []\n        for c in subcls_list:\n            idxs = np.flatnonzero(y == c)\n            idxs = np.random.choice(idxs, samples_per_class, replace=False)\n            idxs_per_party.append(idxs)\n        idxs_per_party = np.hstack(idxs_per_party)\n        combined_idxs.append(idxs_per_party)\n        \n        dict_to_add = {}\n        dict_to_add[\"idx\"] = idxs_per_party\n        dict_to_add[\"X\"] = X[idxs_per_party]\n        #dict_to_add[\"y\"] = 
y[idxs_per_party]\n        #dict_to_add[\"y_super\"] = y_super[idxs_per_party]\n        dict_to_add[\"y\"] = y_super[idxs_per_party]\n        priv_data[count] = dict_to_add\n        count += 1\n    \n    combined_idxs = np.hstack(combined_idxs)\n    total_priv_data = {}\n    total_priv_data[\"idx\"] = combined_idxs\n    total_priv_data[\"X\"] = X[combined_idxs]\n    #total_priv_data[\"y\"] = y[combined_idxs]\n    #total_priv_data[\"y_super\"] = y_super[combined_idxs]\n    total_priv_data[\"y\"] = y_super[combined_idxs]\n    return priv_data, total_priv_data"
  },
  {
    "path": "utility.py",
    "content": "import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\ndef show_dataset_samples(classes, samples_per_class, \n                         images, labels, data_type=\"MNIST\"):\n    num_classes = len(classes)\n    fig, axes = plt.subplots(samples_per_class, num_classes, \n                             figsize=(num_classes, samples_per_class)\n                            )\n    \n    for col_index, cls in enumerate(classes):\n        idxs = np.flatnonzero(labels == cls)\n        idxs = np.random.choice(idxs, samples_per_class, \n                                replace=False)\n        for row_index, idx in enumerate(idxs):    \n            if data_type == \"MNIST\":\n                axes[row_index][col_index].imshow(images[idx],\n                                                  cmap = 'binary', \n                                                  interpolation=\"nearest\")\n                axes[row_index][col_index].axis(\"off\")\n            elif data_type == \"CIFAR\":\n                axes[row_index][col_index].imshow(images[idx].astype('uint8'))\n                axes[row_index][col_index].axis(\"off\")\n                \n            else:\n                print(\"Unknown Data type. 
Unable to plot.\")\n                return None\n            if row_index==0:\n                axes[row_index][col_index].set_title(\"Class {0}\".format(cls))\n                \n                \n    plt.show()\n    return None\n\n\n\n# def plot_history(model):\n    \n#     \"\"\"\n#     input : model is trained keras model.\n#     \"\"\"\n    \n#     fig, axes = plt.subplots(2,1, figsize = (12, 6), sharex = True)\n#     axes[0].plot(model.history.history[\"loss\"], \"b.-\", label = \"Training Loss\")\n#     axes[0].plot(model.history.history[\"val_loss\"], \"k^-\", label = \"Val Loss\")\n#     axes[0].set_xlabel(\"Epoch\")\n#     axes[0].set_ylabel(\"Loss\")\n#     axes[0].legend()\n    \n    \n#     axes[1].plot(model.history.history[\"acc\"], \"b.-\", label = \"Training Acc\")\n#     axes[1].plot(model.history.history[\"val_acc\"], \"k^-\", label = \"Val Acc\")\n#     axes[1].set_xlabel(\"Epoch\")\n#     axes[1].set_ylabel(\"Accuracy\")\n#     axes[1].legend()\n    \n#     plt.subplots_adjust(hspace=0)\n#     plt.show()\n    \n# def show_performance(model, Xtrain, ytrain, Xtest, ytest):\n#     y_pred = None\n#     print(\"CNN+fC Training Accuracy :\")\n#     y_pred = model.predict(Xtrain, verbose = 0).argmax(axis = 1)\n#     print((y_pred == ytrain).mean())\n#     print(\"CNN+fc Test Accuracy :\")\n#     y_pred = model.predict(Xtest, verbose = 0).argmax(axis = 1)\n#     print((y_pred == ytest).mean())\n#     print(\"Confusion_matrix : \")\n#     print(confusion_matrix(y_true = ytest, y_pred = y_pred))\n    \n#     del y_pred"
  }
]