Repository: microsoft/msrflute Branch: main Commit: 8bfe0854ab29 Files: 151 Total size: 775.6 KB Directory structure: gitextract_qg20kqyy/ ├── .flake8 ├── .github/ │ └── workflows/ │ ├── build_docs.yml │ └── codeql.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.TXT ├── NOTICE.txt ├── README.md ├── SECURITY.md ├── azure-pipelines.yml ├── configs/ │ ├── hello_world_mlm_bert_json.yaml │ └── hello_world_nlg_gru_json.yaml ├── core/ │ ├── __init__.py │ ├── client.py │ ├── config.py │ ├── dataloader.py │ ├── dataset.py │ ├── evaluation.py │ ├── federated.py │ ├── metrics.py │ ├── model.py │ ├── schema.py │ ├── server.py │ ├── strategies/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dga.py │ │ ├── fedavg.py │ │ ├── fedlabels.py │ │ └── utils.py │ └── trainer.py ├── doc/ │ └── sphinx/ │ ├── Makefile │ ├── advanced.rst │ ├── class_reference.rst │ ├── conf.py │ ├── index.rst │ ├── launch.rst │ ├── make.bat │ ├── overview.rst │ ├── reference.rst │ ├── requirements.txt │ └── scenarios.rst ├── e2e_trainer.py ├── experiments/ │ ├── __init__.py │ ├── classif_cnn/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── cifar_dataset.py │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ └── utils/ │ │ ├── centralized_training.py │ │ └── download_and_convert_data.py │ ├── cv/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── data.py │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ ├── model_vgg.py │ │ └── server.py │ ├── cv_cnn_femnist/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocess.py │ │ └── model.py │ ├── cv_lr_mnist/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocessing.py │ │ └── model.py │ ├── cv_resnet_fedcifar100/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ 
│ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocessing.py │ │ ├── group_normalization.py │ │ └── model.py │ ├── ecg_cnn/ │ │ ├── .gitignore │ │ ├── centralized_model.ipynb │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ ├── readme.md │ │ └── utils/ │ │ └── preprocess.py │ ├── fednewsrec/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocess_mind.py │ │ ├── fednewsrec_model.py │ │ ├── model.py │ │ └── utils.py │ ├── mlm_bert/ │ │ ├── README.md │ │ ├── config.py │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ └── utils/ │ │ ├── trainer_pt_utils.py │ │ └── trainer_utils.py │ ├── nlg_gru/ │ │ ├── README.md │ │ ├── config.py │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ └── utils/ │ │ └── utility.py │ ├── nlp_rnn_fedshakespeare/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocessing.py │ │ └── model.py │ └── semisupervision/ │ ├── README.md │ ├── config.yaml │ ├── dataloaders/ │ │ ├── RandAugment.py │ │ ├── cifar_dataset.py │ │ ├── dataloader.py │ │ └── dataset.py │ └── model.py ├── extensions/ │ ├── RL/ │ │ └── RL.py │ ├── __init__.py │ ├── privacy/ │ │ ├── __init__.py │ │ ├── analysis.py │ │ ├── dp_kmeans.py │ │ └── metrics.py │ └── quantization/ │ └── quant.py ├── requirements.txt ├── testing/ │ ├── README.md │ ├── build_vocab.py │ ├── create_data.py │ ├── hello_world_classif_cnn.yaml │ ├── hello_world_ecg_cnn.yaml │ ├── hello_world_mlm_bert.yaml │ ├── hello_world_nlg_gru.yaml │ └── test_e2e_trainer.py └── utils/ ├── __init__.py ├── data_utils.py ├── dataloaders_utils.py ├── optimizers/ │ ├── adamW.py │ ├── lamb.py │ └── lars.py ├── preprocessing/ │ ├── create-hdf5.py │ ├── create-json.py │ └── from_json_to_hdf5.py └── utils.py ================================================ 
FILE CONTENTS ================================================ ================================================ FILE: .flake8 ================================================ [flake8] ignore = E501 ================================================ FILE: .github/workflows/build_docs.yml ================================================ name: Build docs on: push: branches: [ main ] pull_request: branches: [ main ] workflow_dispatch: jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Sphinx build uses: ammaraskar/sphinx-action@0.4 with: docs-folder: doc/sphinx/ - name: Commit documentation changes run: | git clone https://github.com/microsoft/msrflute --branch gh-pages --single-branch gh-pages cp -r doc/sphinx/_build/html/* gh-pages/ cd gh-pages git config --local user.email "action@github.com" git config --local user.name "GitHub Action" git add . git commit -m "Update documentation" -a || true - name: Push changes uses: ad-m/github-push-action@master with: branch: gh-pages directory: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/codeql.yml ================================================ # This is based on the standard CodeQL workflow provided by Github name: "CodeQL" on: push: branches: [ "main" ] pull_request: # The branches below must be a subset of the branches above branches: [ "main" ] schedule: - cron: '35 2 * * 3' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ 'python' ] steps: - name: Checkout repository uses: actions/checkout@v3 - name: Set-up MPI uses: mpi4py/setup-mpi@v1 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
# If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v2 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 ================================================ FILE: .gitignore ================================================ __pycache__/ .vscode/ doc/sphinx/_build testing/logs.txt testing/outputs testing/mockup ================================================ FILE: .gitmodules ================================================ [submodule "utils/dp-accountant"] path = utils/dp-accountant url = https://github.com/microsoft/prv_accountant ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. ## [0.1.0] - 2021-11-22 We're super excited to announce FLUTE: Federated Learning Utilities for Testing and Experimentation, a platform for conducting high-performance federated learning simulations! This first release fully focuses on implementing fast prototyping to validate different FL scenarios in a federated environment. ### Features - large scale simulation (millions of clients, sampling tens of thousands per round). - multi-GPU and multi-node orchestration backed up by MPI. - local or global differential privacy. - model quantization. - a variety of standard optimizers and aggregation methods. - most model types including CNNs, RNNs, and Huggingface Transformers. - extensibility, enabling new models, dataloaders, optimizers, and aggregators. - local or cloud-based job staging using AzureML. ## [1.0.0] - 2022-08-29 This release contains major changes in the communication backbone; in order to run previous experiments you have already integrated in FLUTE, please make sure to use `torch.distributed` instead of `MPI` to launch the jobs. For more documentation about the new command, please refer to the [README](README.md). 
### New features - 🏎 Better performance: Support for NCCL and Gloo as backend communication protocols. - Improvements in GPU utilization and overall communication speed (on the order of minutes!) for projects with huge models and datasets. - 🌟 Remove file type dependency on client.py, now FLUTE can receive any kind of dataset and even download the data on-the-fly. The data instantiation is completely under control of each task dataset. - In older versions FLUTE only allowed `json` and `hdf5` files, so the client could recognize it. - 🌟 Abstract classes for new models/dataloaders. - 🌟 Allows Federated Learning with Personalization. - Personalization allows you to leverage each client local data to obtain models that are better adjusted to their own data distribution. You can run the `cv` task in order to try out this feature. ## [1.0.1] - 2023-07-29 🔋 This release removes the restriction of the minimum number of GPUs available in FLUTE, allowing users to run experiments using a single-GPU worker by instantiating both: Server and clients on the same device. For more documentation about how to run an experiment using a single GPU, please refer to the [README](README.md). ### New features - 🌟 Include FedProx aggregation method ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 message: "To cite Microsoft FLUTE in academic papers, please cite it as below." authors: - name: "Microsoft Research" title: "FLUTE: Federated Learning Utilities for Testing and Experimentation" version: 1.0.0 date-released: "2021-11-22" url: "https://github.com/microsoft/msrflute" license: - MIT keywords: - FLUTE - federated learning ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Microsoft Open Source Code of Conduct This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
Resources: - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. ### Pull Requests Submit pull requests to **branch contribution**. PR's in any other branch will not be accepted. When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA. ================================================ FILE: LICENSE.TXT ================================================ Copyright (c) Microsoft Corporation. 
MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: NOTICE.txt ================================================ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION Do Not Translate or Localize This software incorporates components from the projects listed below. The original copyright notices and the licenses under which Microsoft received such components are set forth below and are provided for informational purposes only. Microsoft reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise. This software includes parts of the Huggingface/Transformers Library (https://github.com/huggingface/transformers). State-of-the-art of Natural Language Processing for Jax, PyTorch and TensorFlow. 
Huggingface/Transformers library is licensed under Apache License 2.0, you can find a copy of this license at https://github.com/huggingface/transformers/blob/master/LICENSE This software includes parts of the Tensorflow/Privacy Library (https://github.com/tensorflow/privacy). A library that includes implementations of TensorFlow optimizers for training machine learning models with differential privacy. The Tensorflow/Privacy library is licensed under Apache License 2.0, you can find a copy of this license at https://github.com/tensorflow/privacy/blob/master/LICENSE This software includes parts of LEAF Library (https://github.com/TalwalkarLab/leaf). A Benchmark for Federated Settings. LEAF library is licensed under BSD 2-Clause License, you can find a copy of this license at https://github.com/TalwalkarLab/leaf/blob/master/LICENSE.md This software includes parts of ECG Classification from Kaggle Competition (https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism). An example for ECG Classification | CNN LSTM Attention Mechanism. This example is licensed under Apache License 2.0, you can find a copy of this license at https://www.apache.org/licenses/LICENSE-2.0 This software includes parts of Torchvision Library (https://github.com/pytorch/vision.git). A package of popular datasets, model architectures, and common image transformations for computer vision. This example is licenced under BSD 3-Clause License, you can find a copy of this licence at https://github.com/pytorch/vision/blob/main/LICENSE This software includes parts of FedML Library (https://github.com/FedML-AI/FedML).The Community Building Open and Collaborative AI Anywhere at Any Scale. FedML library is licensed under Apache License 2.0, you can find a copy of this license at https://github.com/FedML-AI/FedML/blob/master/LICENSE This software includes parts of FedNewsRec-EMNLP-Findings-2020 repository (https://github.com/taoqi98/FedNewsRec). 
Code from the paper "Privacy-Preserving News Recommendation Model Learning". This example is licenced under MIT License, you can find a copy of this licence at https://github.com/taoqi98/FedNewsRec/blob/master/LICENSE This software includes parts of Fast AutoAugment repository (https://github.com/kakaobrain/fast-autoaugment). Code from the paper "Fast AutoAugment" (Accepted at NeurIPS 2019). This example is licenced under MIT License, you can find a copy of this licence at https://github.com/kakaobrain/fast-autoaugment/blob/master/LICENSE This software includes parts of NIID-Bench repository (https://github.com/Xtra-Computing/NIID-Bench). Code from the paper "Federated Learning on Non-IID Data Silos: An Experimental Study". This example is licenced under MIT License, you can find a copy of this licence at https://github.com/Xtra-Computing/NIID-Bench/blob/main/LICENSE ================================================ FILE: README.md ================================================ # FLUTE Welcome to FLUTE (Federated Learning Utilities for Testing and Experimentation), a platform for conducting high-performance federated learning simulations. ## Features FLUTE is a pytorch-based orchestration environment enabling GPU or CPU-based FL simulations. The primary goal of FLUTE is to enable researchers to rapidly prototype and validate their ideas. Features include: - large scale simulation (millions of clients, sampling tens of thousands per round) - single/multi GPU and multi-node orchestration - local or global differential privacy - model quantization - a variety of standard optimizers and aggregation methods - most model types including CNNs, RNNs, and Huggingface Transformers. - extensibility, enabling new models, dataloaders, optimizers, and aggregators. 
- local or cloud-based job staging using AzureML ## Benchmarking The following common tasks were used to evaluate the performance in speed/memory utilization of FLUTE compared with the most representative simulation platforms based on their number of starts on GitHub: FedML 0.7.303 and Flower 1.0.0. |Task|Data Set|Model|Algorithm|# Clients|Clients per round|Batch Size|Client Optimizer|lr|Epochs|# Rounds|Test Freq| |:----|:----|:----|:----|:----|:----|:----|:----|:----|:----|:----|:----| |CV|MNIST|LR|FedAvg|1000|10|10|SGD|0.03|1|100|20| |CV|Federated EMNIST|CNN (2 Conv + 2 FC)|FedAvg|3400|10|20|SGD|0.1|1|1500|50| |CV|FED_CIFAR-100|ResNet-18+group normalization|FedAvg|500|10|20|SGD|0.1|1|4000|50| |NLP|Shakespeare|RNN (2 LSTM + 1 FC)|FedAvg|715|10|4|SGD|0.8|1|1200|50| ### FedML Comparison This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). Showing that in some cases FLUTE can outperform 43x faster. ``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` You can find the examples above in [experiments](experiments). 
### Flower Comparison This comparison was carried out using Flower (Simulator) on version 1.0.0 at commit ID [4e7fad9](https://github.com/adap/flower/tree/4e7fad99389a5ee511730841b61f279e3359cb16) with the [lr_mnist](experiments/cv_lr_mnist/) task. Showing that in some cases FLUTE can outperform 53x faster. ``` ________________________________________________ | | Flower (Ray) | FLUTE (NCCL/Gloo) | | | Acc | Time | Acc | Time | |--------|-----|-------------|-----|-------------| | CPU | ~80 | 00:30:14 | ~80 | 00:03:20 | | GPU 2x | ~80 | 01:21:44 | ~80 | 00:01:31 | | GPU 4x | ~79 | 00:56:45 | ~81 | 00:01:26 | ------------------------------------------------ ``` You can find the example above in the [cv_lr_mnist](experiments/cv_lr_mnist/) folder. ## Quick Start Install the requirements stated inside of `requirements.txt`. Ideally this should be done inside of a virtual environment, for instance, using Anaconda. ``` conda create -n FLUTE python==3.7 pip install -r requirements.txt ``` FLUTE uses torch.distributed API as its main communication backbone, supporting three built-in backends. For more information please refer to [Distributed Communication Package](https://pytorch.org/docs/stable/distributed.html). Therefore, we highly suggest to use NCCL backend for distributed GPU training and Gloo for distributed CPU training. There is no `setup.py` as FLUTE is not currently distributed as a package, but instead meant to run from the root of the repository. After this initial setup, you can use the data created for the integration test for a first local run. Note that this data needs to be downloaded manually inside the `testing` folder, for more instructions please look at [the README file inside `testing`](testing/README.md). 
For single-GPU runs: ``` python -m torch.distributed.run --nproc_per_node=1 e2e_trainer.py -dataPath ./testing -outputPath scratch -config testing/hello_world_nlg_gru.yaml -task nlg_gru -backend nccl ``` For multi-GPU runs (3 GPUs): ``` python -m torch.distributed.run --nproc_per_node=3 e2e_trainer.py -dataPath ./testing -outputPath scratch -config testing/hello_world_nlg_gru.yaml -task nlg_gru -backend nccl ``` The config file `testing/hello_world_nlg_gru.yaml` has some comments explaining the major sections and some important details; essentially, it consists in a very short experiment where a couple of iterations are done for just a few clients. A `scratch` folder will be created containing detailed logs. ## Documentation Online documentation is available at https://microsoft.github.io/msrflute/ Locally, the documentation is inside the `doc/sphinx` folder. To build the docs on Linux: ``` $ pip install sphinx $ cd doc/sphinx $ make html ``` On Windows, you can use the `make.bat` script. It may be necessary to `export PYTHONPATH=../../` for sphinx to find the code. ## Architecture The core client/server training code is inside the `core` folder. - Server-side federation and global DP application takes place in `server.py`, more specifically in the `OptimizationServer.train()` method. - Client-side training updates take place in the static method `Client.process_round()`, inside `client.py`. General FL orchestration code is in `federated.py`, but for most hub and spoke federation scenarios you won't need to touch this (unless you want to invest in optimizing server-client calls, which would be great!). Note that FLUTE does not implement secure aggregation since this is primarily a security feature for production scenarios; contributors are invited to add it for experimentation purposes. The primary entry point for an experiment is in the script `e2e_trainer.py`. Primary config scripts for experiments are in `configs`. 
For instance, a basic training scenario for a next-word prediction task is set up in `hello_world_nlg_gru_json.yaml`. Privacy accounting is expensive so the main parameters are logged and the actual accounting can be done offline. RDP privacy accounting is in `extensions/privacy/analysis.py`. A better accounting method is in the `dp-accountant` submodule. ## Customization See `experiments` folder for illustrations of how dataloaders and models are customized. In order to include a new experiment, the new scenario must be added following the same folder structure as `nlg_gru` and `mlm_bert`, naming the folder with the task. ## Experiments Experiments are defined by YAML files, examples are provided in the `configs` folder. These can be run either locally or on AzureML. For running experiments on AzureML, the CLI can help. You should first [install the CLI](https://docs.microsoft.com/en-us/azure/machine-learning/reference-azure-machine-learning-cli) (make sure you have v2) and [create a resource group and workspace](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli?tabs=createnewresources%2Cvnetpleconfigurationsv1cli). You can then create a compute cluster, type `az ml compute create -h` for more info. Afterwards, you should write a YAML file with instructions for the job; we provide a simple example below ```yaml experiment_name: basic_example description: Basic example of AML config for submitting FLUTE jobs code: local_path: . 
compute: azureml:Test environment: image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel inputs: data: folder: azureml://datastores/data/paths/cifar mode: rw_mount command: > apt -y update && apt -y install openmpi-bin libopenmpi-dev openssh-client && python3 -m pip install --upgrade pip && python3 -m pip install -r requirements.txt && python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -outputPath=./outputs -dataPath={inputs.data} -task=classif_cnn -config=./experiments/classif_cnn/config.yaml -backend=nccl ``` You should replace `compute` with the name of the one you created before, and adjust the path of the datastore containing the data -- in the example above, we created a datastore called `data` and added to it a folder called `cifar`, which contained the two HDF5 files. The command passed above will install dependencies and then launch a distributed job with 4 threads, for the experiment defined in `experiments/classif_cnn`. Details on how to run a job using the AzureML CLI are given [in its documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli), but typically it suffices to set up the environment and type `az ml job create -f `. In the same page of the documentation, you can also find more info about how to set up the YAML file above, in case other changes are needed. Note that the `local_path` above is relative to the location of the YAML file, so setting it to `.` assumes it is in the same folder as `e2e_trainer.py`. All files on this folder will be uploaded to Azure, including hidden folders such as `.git`, so make sure to temporarily get rid of large files and folders that are not needed. After launching the experiment, you can follow it on AzureML Studio, which prints logs, plots metrics and makes the output easily available after the experiment is finished. ## Privacy Accounting Accounting is expensive, so we log all the privacy parameters so that accounting can be run offline. 
Best run on a Linux box with a GPU. In particular, we use a DP accountant from another Microsoft repository, which is included in ours as a submodule. For using this accountant, just follow the instructions below: ``` $ git submodule update --init --recursive $ cd utils $ cd dp-accountant $ python setup.py install $ ./bin/compute-dp-epsilon --help usage: compute-dp-epsilon [-h] -p SAMPLING_PROBABILITY -s NOISE_MULTIPLIER -i ITERATIONS -d DELTA ``` ## Third Party Notice This software includes the files listed below from the Huggingface/Transformers Library (https://github.com/huggingface/transformers) as part of task performance and preprocessing pretrained models. experiments/mlm_bert └── utils ├── trainer_pt_utils.py └── trainer_utils.py This software includes the file extensions/privacy/analysis.py from the Tensorflow/Privacy Library (https://github.com/tensorflow/privacy) as part of Renyi Differential Privacy implementation. This software includes the script testing/build_vocab.py from LEAF Library (https://github.com/TalwalkarLab/leaf) to create the vocabulary needed to run a testing job. This software includes the model implementation of the example ECG Classification | CNN LSTM Attention Mechanism from Kaggle Competition (https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism) to reproduce the [ecg_cnn](experiments/ecg_cnn/model.py) experiment. This software includes the model implementation of the FedNewsRec repository (https://github.com/taoqi98/FedNewsRec)| Code from the paper "Privacy-Preserving News Recommendation Model Learning" (https://arxiv.org/abs/2003.09592) ported to PyTorch framework to reproduce the [fednewsrec](experiments/fednewsrec/model.py) experiment. For more information about third-party OSS licence, please refer to [NOTICE.txt](NOTICE.txt). 
This software includes the Data Augmentation scripts of the Fast AutoAugment repository (https://github.com/kakaobrain/fast-autoaugment) to preprocess the data used in the [semisupervision](experiments/semisupervision/dataloaders/cifar_dataset.py) experiment. This software included the FedProx logic implementation of the NIID-Bench repository (https://github.com/Xtra-Computing/NIID-Bench/tree/main) as Federated aggregation method used in the [trainer](core/trainer.py) object. ## Support You are welcome to open issues on this repository related to bug reports and feature requests. ## Contributing Contributions are welcomed and encouraged. For details on how to contribute, please see [CONTRIBUTING.md](CONTRIBUTING.md). ================================================ FILE: SECURITY.md ================================================ ## Security Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. ## Reporting Security Issues **Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) * Full paths of source file(s) related to the manifestation of the issue * The location of the affected source code (tag/branch/commit or direct URL) * Any special configuration required to reproduce the issue * Step-by-step instructions to reproduce the issue * Proof-of-concept or exploit code (if possible) * Impact of the issue, including how an attacker might exploit the issue This information will help us triage your report more quickly. If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. ## Preferred Languages We prefer all communications to be in English. ## Policy Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
================================================ FILE: azure-pipelines.yml ================================================ trigger: - main pool: vmImage: 'windows-latest' steps: - task: CredScan@2 inputs: toolMajorVersion: 'V2' - task: Semmle@1 env: SYSTEM_ACCESSTOKEN: $(System.AccessToken) inputs: sourceCodeDirectory: '$(Build.SourcesDirectory)' language: 'python' querySuite: 'Recommended' timeout: '1800' ram: '16384' addProjectDirToScanningExclusionList: true - task: ComponentGovernanceComponentDetection@0 inputs: scanType: 'Register' verbosity: 'Verbose' alertWarningLevel: 'High' - task: PublishSecurityAnalysisLogs@2 inputs: ArtifactName: 'CodeAnalysisLogs' ArtifactType: 'Container' AllTools: true ToolLogsNotFoundAction: 'Standard' ================================================ FILE: configs/hello_world_mlm_bert_json.yaml ================================================ # Basic configuration file for running mlm_bert example using json files. # Parameters needed to initialize the model model_config: model_type: BERT model_folder: experiments/mlm_bert/model.py BERT: loader_type: text model: model_name: roberta-large cache_dir: ./cache_dir use_fast_tokenizer: False mask_token: task: mlm past_index: -1 prediction_loss_only: false process_line_by_line: false training: seed: 12345 label_smoothing_factor: 0 batch_size: 64 max_seq_length: 256 # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of the parameters are needed. enable_global_dp: false # Local dp clips and adds noise on the client and centrally accumulates the privacy budget eps: 100 # epsilon global_sigma: 0.35 # Used when global dp is enabled, specifies the global Gaussian noise weight_scaler: 0.0001 # indicates how the aggregation weights are scaled before noise addition, and unscaled afterwards. 
max_grad: 0.008 # max gradient max_weight: 0.5 # The max_weight and min_weight should be already scaled by weight_scaler min_weight: 0.0000001 # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. fast_aggregation: true wantRL: false # Enable/Disable Reinforcement learning RL: # Reinforcement Learning parameters RL_path_global: false marginal_update_RL: true RL_path: ./RL_models model_descriptor_RL: marginalUpdate network_params: 300,128,128,128,64,100 initial_epsilon: 0.5 final_epsilon: 0.0001 epsilon_gamma: 0.90 max_replay_memory_size: 1000 minibatch_size: 16 gamma: 0.99 optimizer_config: lr: 0.0003 type: adam amsgrad: true annealing_config: type: step_lr step_interval: epoch step_size: 1 gamma: 0.95 optimizer_config: # Configuration for server-side optimizer lr: 0.00001 weight_decay: 0.01 type: adamW annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 1000 val_freq: 4 # Frequency for validation rounds rec_freq: 16 # Frequency for testing rounds initial_val : true # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 10000 # Total number of rounds for FL num_clients_per_iteration: 200 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data val_data: task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 min_words_per_utt: 5 max_samples_per_user: 
5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir # Note this is NOT the main training data configuration, which is configured in the # client config. This section is ignored unless you are running replay data. # If you want to run replay data- set a path name for train_data_server. # train: # loader_type: text # train_data: null # train_data_server: null # desired_max_samples: null test: # Test data configuration test_data: task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 max_samples_per_user: 5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir type: model_optimization # Server type aggregate_median: softmax # FL aggregation method weight_train_loss: mag_mean_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 1.00 initial_lr_client: 0.00001 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # server_replay_config: # This is only applies if the server-side training data is fully configured and loaded # server_iterations: 50 # optimizer_config: # lr: 0.00002 # amsgrad: true # type: adam # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration list_of_train_data: task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 24 max_seq_length: 256 min_words_per_utt: 5 desired_max_samples: 5000 mask_token: num_workers: 0 num_frames: 0 max_grad_norm: 15.0 prepend_datapath: false cache_dir: ./cache_dir pin_memory: true type: optimization meta_optimizer_config: lr: 0.01 type: adam optimizer_config: type: adamW weight_decay: 0.01 amsgrad: true annealing_config: type: step_lr step_interval: epoch step_size: 2 gamma: 1.0 ================================================ FILE: configs/hello_world_nlg_gru_json.yaml ================================================ # Basic configuration file for running nlg_gru example using json files. # Parameters needed to initialize the model model_config: model_type: GRU model_folder: experiments/nlg_gru/model.py pretrained_model_path: embed_dim: 160 vocab_size: 10000 hidden_dim: 512 OOV_correct: false # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of parameters is needed. # enable_local_dp: true # Local dp clips and adds noise on the client and centrally accumulates the privacy budget # eps: 100 # epsilon # max_grad: 0.008 # max gradient # weight_scaler: 0.0001 # indicates how the aggregation weights scaled before noise addition, and unscaled afterwards. # max_weight: 0.0001 # The max_weight and min_weight should be already scaled by weight_scaler # min_weight: 0.00009 # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. 
# apply_indices_extraction: true # If we extract word indices we want to consider the rank of the words extracted. # allowed_word_rank: 9000 # Any word that rank above this value is considered privacy risk # apply_leakage_metric: true # max_leakage: 30 # max_allowed_leakage: 3 # adaptive_leakage_threshold: 0.95 # Takes the 95th percentile of the leakage for the next round. # is_leakage_weighted: true # attacker_optimizer_config: # lr: 0.03 # type: adamax # amsgrad: false # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedProx # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # Enable/Disable Reinforcement learning resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. optimizer_config: # Configuration for server-side optimizer type: lamb lr: 0.1 weight_decay: 0.005 annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 2 # Frequency for validation rounds rec_freq: 4 # Frequency for testing rounds initial_val : true # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 11 # Total number of rounds for FL num_clients_per_iteration: 10 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false val_data: # Path for validation data vocab_dict: # Path for vocabulary pin_memory: true num_workers: 0 # Indicates how many workers are used for creating batches num_frames: 2400 max_batch_size: 2048 max_num_words: 25 unsorted_batch: true # Note this is NOT the main training data configuration, which is configured in the # client config. This section is ignored unless you are running replay data. 
# If you want to run replay data- set a path name for train_data_server. # train: # batch_size: 128 # loader_type: text # tokenizer_type: not_applicable # prepend_datapath: false # train_data: null # train_data_server: null # vocab_dict: # pin_memory: true # num_workers: 0 # num_frames: 2400 # desired_max_samples: 500 # max_grad_norm: 10.0 # max_batch_size: 128 # max_num_words: 25 # unsorted_batch: true test: # Test data configuration batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false train_data: null train_data_server: null test_data: # Path for validation data vocab_dict: # Path for vocabulary pin_memory: true num_workers: 0 # Indicates how many workers are used for creating batches max_batch_size: 2048 max_num_words: 25 unsorted_batch: true type: model_optimization aggregate_median: softmax # FL aggregation method weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 20.0 initial_lr_client: 1.0 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # server_replay_config: # This is only applies if the server-side training data is fully configured and loaded # server_iterations: 50 # optimizer_config: # type: adam # lr: 0.00002 # amsgrad: true # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: mu: 0.001 # Used only for FedProx aggregation method meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false num_skips_threshold: 10 copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration batch_size: 64 tokenizer_type: not_applicable prepend_datapath: false list_of_train_data: # Path to training data vocab_dict: # Path to vocabulary pin_memory: true num_workers: 0 desired_max_samples: 50000 max_grad_norm: 20.0 max_batch_size: 128 max_num_words: 25 unsorted_batch: true type: optimization meta_optimizer_config: lr: 1.0 type: sgd optimizer_config: type: sgd annealing_config: type: step_lr step_interval: epoch step_size: 1 gamma: 1.0 ================================================ FILE: core/__init__.py ================================================ ================================================ FILE: core/client.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' The Client object is short-lived, instantiated inside workers 1 to N for processing a given client's data. It's main method is the `process_round` function, used to update the model given a client's data. 
''' import copy import logging import os import time from easydict import EasyDict as edict from importlib.machinery import SourceFileLoader import numpy as np import torch # Internal imports import core.federated as federated from .strategies import select_strategy from .trainer import ( Trainer, run_validation_generic, set_component_wise_lr, ) from utils import ( ScheduledSamplingScheduler, make_optimizer, print_rank, to_device, convex_inference, alpha_update, ) from utils.dataloaders_utils import ( make_train_dataloader, make_val_dataloader, make_test_dataloader, get_dataset, ) import extensions.privacy from extensions.privacy import metrics as privacy_metrics from experiments import make_model global train_dataset global trainset_unlab global trainset_unlab_rand class Client: # It's unclear why, but sphinx refuses to generate method docs # if there is no docstring for this class. """Client class for specifying individual client training tasks""" def __init__(self, client_id, config, send_gradients): ''' Client side processing: computing gradients, update the model and send them back to the server Args: client_id (int): identifier for grabbing that client's data. config (dict): dictionary with parameters loaded from config file. send_gradients (bool): if True, model gradients are sent back; otherwise, model weights are sent back. ''' super().__init__() self.client_id = client_id self.config = copy.deepcopy(config) self.send_gradients = send_gradients def get_client_data(self, dataset=None): '''"Getter" method that returns all object's attributes at once.''' client_data = self.get_data(self.client_id, dataset) return self.client_id, client_data, self.config, self.send_gradients @staticmethod def get_train_dataset(data_path, client_train_config, task): '''This function will obtain the dataset for all training users. Args: data_path (str): path to file containing taining data. client_train_config (dict): trainig data config. task (str): task name. 
''' global train_dataset global trainset_unlab global trainset_unlab_rand train_dataset = get_dataset(data_path, client_train_config, task, mode="train") if task == 'semisupervision': trainset_unlab = get_dataset(data_path, client_train_config, task, mode="train", user_idx = -2) trainset_unlab_rand = get_dataset(data_path, client_train_config, task, mode="train", user_idx = -3) else: trainset_unlab = None trainset_unlab_rand = None return len(train_dataset.user_list) @staticmethod def get_data(clients, dataset): ''' Create training dictionary''' if dataset == None: # Training case datasets = [train_dataset, trainset_unlab, trainset_unlab_rand] if trainset_unlab != None else [train_dataset] else: # Evaluation case datasets = [dataset] data_with_labels = hasattr(datasets[0],"user_data_label") strcts = [] # Returning list length will always be 1 except when the task is semisupervision for dataset in datasets: input_strct = {'users': [], 'num_samples': [],'user_data': dict(), 'user_data_label': dict()} if data_with_labels else {'users': [], 'num_samples': [],'user_data': dict()} for client in clients: user = dataset.user_list[client] input_strct['users'].append(user) input_strct['num_samples'].append(dataset.num_samples[client]) input_strct['user_data'][user]= dataset.user_data[user] if data_with_labels: input_strct['user_data_label'][user] = dataset.user_data_label[user] strcts.append(edict(input_strct)) return strcts @staticmethod def run_testvalidate(client_data, server_data, mode, model): '''Called by worker to run test/validation sample on a client. This functions assumes set_model_for_round has already been called to push the model to the client (see federated.py). Args: client_data (tuple): client data and config. It is a tuple with 3 components; importantly, the second component is a dict containing the data, and the third component is a dict with the config parsed from the YAML file. server_data (tuple): server data (model parameters mostly). 
                It is a tuple with 2 components; importantly, the second
                component consists of the current model parameters.
            mode (str): whether to `test` or `validate`.
            model (torch.nn.Module): actual model without parameters.
        '''

        # Process inputs and initialize variables
        _, data_strcts, config, _ = client_data
        _, model_parameters, iteration = server_data
        # Deep-copy so local mutations don't leak back into the shared config.
        config = copy.deepcopy(config)
        model_path = config["model_path"]
        begin = time.time()

        # Use the server's data config since we're distributing test/validate from the server
        data_strct = data_strcts[0]
        data_config = config['server_config']['data_config'][mode]
        want_logits = data_config.get('wantLogits', False)
        send_dicts = config['server_config'].get('send_dicts', False)

        # Create dataloader for the requested evaluation mode
        dataloader = None
        print_rank('making dataloader with task {}'.format(config['server_config']['task']), loglevel=logging.DEBUG)
        if mode == 'test':
            dataloader = make_test_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)
        elif mode == 'val':
            dataloader = make_val_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)

        # Set model parameters received from the server
        n_layers, n_params = len([f for f in model.parameters()]), len(model_parameters)
        print_rank(f'Copying model parameters... {n_layers}/{n_params}', loglevel=logging.DEBUG)
        model = to_device(model)
        if send_dicts:  # Send model state dictionary
            tmp = {}
            for param_key, param_dict in zip(model.state_dict(), model_parameters):
                tmp[param_key] = param_dict
            model.load_state_dict(tmp)
        else:  # Send parameters (copied tensor-by-tensor, onto GPU when available)
            for p, data in zip(model.parameters(), model_parameters):
                p.data = data.detach().clone().cuda() if torch.cuda.is_available() else data.detach().clone()
        print_rank(f'Model setup complete. {time.time() - begin}s elapsed.', loglevel=logging.DEBUG)

        # Compute output and metrics on the test or validation data
        num_instances = sum(data_strct['num_samples'])
        print_rank(f'Validating {num_instances}', loglevel=logging.DEBUG)
        output, metrics = run_validation_generic(model, dataloader)

        # Load local model if necessary (personalization: blend global and
        # per-user local model results)
        if config['server_config']['type']=='personalization':
            local_model = make_model(config['model_config'])
            user = data_strct['users'][0]
            local_model_name = os.path.join(model_path, user + '_model.tar')
            if os.path.exists(local_model_name):
                print_rank('Loading Local Model .. {}'.format(local_model_name))
                checkpoint = torch.load(local_model_name)
                local_model.load_state_dict(checkpoint["model_state_dict"])
            local_alpha_name = os.path.join(model_path, user + '_alpha')
            # NOTE(review): `alpha` is only bound when the alpha file exists;
            # the convex_inference call below would raise NameError otherwise.
            # Presumably process_round always writes the alpha file before a
            # personalization evaluation round — confirm.
            if os.path.exists(local_alpha_name):
                alpha = torch.load(local_alpha_name)
                print_rank('Loading Alpha Weight from {}: Value={}'.format(local_model_name, alpha))

            # Run inference and get logits back (fresh dataloader for the
            # local model pass)
            if mode == 'test':
                dataloader = make_test_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)
            elif mode == 'val':
                dataloader = make_val_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)
            output_local, local_metrics = run_validation_generic(local_model, dataloader)
            loss_local = local_metrics['loss']['value']
            # NOTE(review): this `cer` value is immediately overwritten by the
            # convex_inference result below — looks like dead code; confirm.
            cer = local_metrics['acc']['value']

            # Combine logits: convex interpolation of global/local outputs,
            # loss reported as the average of the two passes
            cer = convex_inference(output, output_local, alpha=alpha)
            metrics['loss']['value'] = (metrics['loss']['value'] + loss_local) / 2
            metrics['acc']['value'] = cer

        # Only ship logits back when the data config asked for them
        output = None if not want_logits else output
        return output, metrics, num_instances

    @staticmethod
    def process_round(client_data, server_data, model, data_path, eps=1e-7):
        '''Compute gradients given client's data and update model.

        Args:
            client_data (tuple): client data and config.
                It is a tuple consisting of 4 components: an int indicating
                the client's id, a dict containing that client's data, a dict
                with the config parsed from the YAML file, and a bool
                indicating whether or not gradients should be sent.
            server_data (tuple): server data (model parameters mostly). It is
                a tuple consisting of 2 components; importantly, the first is
                a float giving the client's learning rate, and the second a
                list of torch.Tensor's with current model parameters.
            model (torch.nn.Module): actual model without parameters.
            data_path (str): where to get data from.
            eps (float): lower bound for aggregation weights.
                NOTE(review): `eps` is never referenced in this body —
                possibly a leftover parameter; confirm before removing.
        '''

        # Ensure the client is assigned to the correct GPU (one GPU per worker)
        if torch.cuda.is_available() and torch.cuda.device_count() == federated.size():
            torch.cuda.set_device(federated.local_rank())

        # Process inputs and initialize variables
        client_id, data_strcts, config, send_gradients = client_data
        initial_lr, model_parameters, iteration = server_data
        # Deep-copy so local mutations (e.g. the lr written below) stay local.
        config = copy.deepcopy(config)

        model_config = config['model_config']
        client_config = config['client_config']
        data_config = client_config['data_config']['train']
        semisupervision_config = client_config.get('semisupervision',None)
        task = client_config.get('task', {})
        trainer_config = client_config.get('trainer_config', {})
        privacy_metrics_config = config.get('privacy_metrics_config', None)
        model_path = config["model_path"]

        # Instantiate the client-side federated aggregation strategy
        strategy_algo = config['strategy']
        StrategyClass = select_strategy(strategy_algo)
        strategy = StrategyClass('client', config)
        print_rank(f'Client successfully instantiated strategy {strategy}', loglevel=logging.DEBUG)
        send_dicts = config['server_config'].get('send_dicts', False)

        begin = time.time()
        client_stats = {}

        data_strct = data_strcts[0]
        user = data_strct['users'][0]
        print_rank('Loading : {}-th client with name: {}, {} samples, {}s elapsed'.format(
            client_id[0], user, data_strct['num_samples'][0], time.time() - begin), loglevel=logging.INFO)

        # Get dataloaders
        train_dataloader = make_train_dataloader(data_config, data_path, task=task, clientx=0, data_strct=data_strct)

        # Instantiate the model object (only when the worker didn't receive one)
        if model is None:
            model = make_model(
                model_config,
                dataloader_type=train_dataloader.__class__.__name__,
                input_dim=data_config['input_dim'],
                vocab_size=train_dataloader.vocab_size,
            )

        # Set model parameters received from the server
        n_layers, n_params = len([f for f in model.parameters()]), len(model_parameters)
        print_rank(f'Copying model parameters... {n_layers}/{n_params}', loglevel=logging.DEBUG)
        model = to_device(model)

        if send_dicts:  # Send model state dictionary
            tmp = {}
            for param_key, param_dict in zip(model.state_dict(), model_parameters):
                tmp[param_key] = param_dict
            model.load_state_dict(tmp)
        else:  # Send parameters
            for p, data in zip(model.parameters(), model_parameters):
                p.data = data.detach().clone().cuda() if torch.cuda.is_available() else data.detach().clone()
        print_rank(f'Model setup complete. {time.time() - begin}s elapsed.', loglevel=logging.DEBUG)

        # Fix parameters of layers listed as non-updatable
        if 'updatable_names' in trainer_config:
            set_component_wise_lr(model, client_config['optimizer_config'], trainer_config['updatable_names'])

        # Create the optimizer on the workers
        # NOTE: the server dictates the learning rate for the clients
        client_config['optimizer_config']['lr'] = initial_lr
        optimizer = make_optimizer(client_config['optimizer_config'], model)

        # Make the scheduled sampling scheduler
        ss_scheduler = None
        if 'ss_config' in client_config and client_config['ss_config'] is not None:
            ss_scheduler = ScheduledSamplingScheduler(model=model, **client_config['ss_config'])

        # Make the trainer
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            ss_scheduler=ss_scheduler,
            train_dataloader=train_dataloader,
            server_replay_config=client_config,
            max_grad_norm=client_config['data_config']['train'].get('max_grad_norm', None),
            anneal_config=client_config['annealing_config'] if 'annealing_config' in client_config else None,
            num_skips_threshold=client_config['num_skips_threshold'] if 'num_skips_threshold' in client_config else -1,
            ignore_subtask=client_config['ignore_subtask']
        )

        if trainer.optimizer is not None:
            initial_optimizer_state = copy.deepcopy(trainer.optimizer.state_dict())

        annealing_config = client_config['annealing_config'] if 'annealing_config' in client_config else None

        assert 'desired_max_samples' in client_config['data_config']['train'], 'Missing \'desired_max_samples\' entry in data config parameter'
        desired_max_samples = client_config['data_config']['train']['desired_max_samples']

        if trainer.optimizer is not None:  # reset the optimizer state
            if initial_lr > 0:
                trainer.optimizer.param_groups[0].update({'lr': initial_lr})
            initial_optimizer_state = copy.deepcopy(trainer.optimizer.state_dict())
            trainer.reset_optimizer(initial_optimizer_state, annealing_config)

        # Mark the end of setup
        end = time.time()
        client_stats['setup'] = end - begin
        print_rank(f'Client setup cost {client_stats["setup"]}s', loglevel=logging.DEBUG)
        begin_training = end

        # Training begins here
        trainer.model.train()
        trainer.model.zero_grad()

        # Save the client batches if we want to evaluate the privacy metrics
        apply_privacy_metrics = (False if privacy_metrics_config is None else privacy_metrics_config['apply_metrics'])

        # This is where training actually happens; strategy-specific extra
        # inputs are bundled into algo_payload.
        algo_payload = None
        if strategy_algo == 'FedLabels':
            datasets = [get_dataset(data_path, config, task, mode="train", test_only=False, data_strct=data_strcts[i], user_idx=0) for i in range(3)]
            algo_payload = {'strategy':'FedLabels', 'data': datasets, 'iter': iteration, 'config': semisupervision_config}
        elif strategy_algo == 'FedProx':
            algo_payload = {'strategy':'FedProx', 'mu': client_config.get('mu',0.001)}

        train_loss, num_samples, algo_computation = trainer.train_desired_samples(desired_max_samples=desired_max_samples, apply_privacy_metrics=apply_privacy_metrics, algo_payload=algo_payload)
        print_rank('client={}: training loss={}'.format(client_id[0], train_loss), loglevel=logging.DEBUG)

        # Estimate gradient magnitude mean/var
        # Now computed when the sufficient stats are updated.
        assert 'sum' in trainer.sufficient_stats
        assert 'mean' in trainer.sufficient_stats

        trainer.train_loss = train_loss
        trainer.num_samples = num_samples
        trainer.algo_computation = algo_computation

        # Compute pseudo-gradient: difference between server parameters and
        # locally-updated parameters, stored in .grad for aggregation
        if not send_dicts:
            for p, data in zip(trainer.model.parameters(), model_parameters):
                data = to_device(data)
                p.grad = data - p.data

        payload = strategy.generate_client_payload(trainer) if send_gradients else None

        if config['server_config']['type'] == 'personalization':

            # Initialize convex weight alpha
            alpha = config['client_config'].get('convex_model_interp', 0.75)
            local_model = make_model(config['model_config'])
            train_dataloader = make_train_dataloader(data_config, data_path, task=task, clientx=0, data_strct=data_strct)
            local_optimizer = make_optimizer(client_config['optimizer_config'], local_model)

            # Make the trainer for the per-user local model
            local_trainer = Trainer(
                model=local_model,
                optimizer=local_optimizer,
                ss_scheduler=ss_scheduler,
                train_dataloader=train_dataloader,
                server_replay_config=client_config,
                max_grad_norm=client_config['data_config']['train'].get('max_grad_norm', None),
                anneal_config=client_config['annealing_config'] if 'annealing_config' in client_config else None,
                num_skips_threshold=client_config['num_skips_threshold'] if 'num_skips_threshold' in client_config else -1,
                ignore_subtask=client_config['ignore_subtask']
            )

            # Resume local model/alpha from this user's checkpoint if present
            local_model_name = os.path.join(model_path, user + '_model.tar')
            local_alpha_name = os.path.join(model_path, user + '_alpha')
            if os.path.exists(local_model_name):
                print_rank('Loading Local Model .. {}'.format(local_model_name))
                local_trainer.load(local_model_name, update_lr_scheduler=False, update_ss_scheduler=False)
            if os.path.exists(local_alpha_name):
                print_rank('Loading Alpha Weight .. {}'.format(local_model_name), loglevel=logging.INFO)
                alpha = torch.load(local_alpha_name)

            # Copy original model (pre-training snapshot for pseudo-gradient)
            original_local_model = local_trainer.get_model()

            # Training begins here
            local_trainer.model.train()
            local_trainer.model.zero_grad()

            # Run Local Processing
            # NOTE(review): this call unpacks 2 values while the earlier
            # train_desired_samples call returns 3 — confirm the return arity
            # when algo_payload is None.
            train_loss, num_samples = local_trainer.train_desired_samples(desired_max_samples=desired_max_samples, apply_privacy_metrics=False)
            print_rank('client={}, user:{}: LOCAL training loss={}'.format(client_id[0], user, train_loss), loglevel=logging.INFO)
            local_trainer.save(model_path=model_path, config=config, token=user)

            # Estimate the pseudo-gradient for local model
            for p, orig_param in zip(local_trainer.model.parameters(), original_local_model.parameters()):
                orig_param = orig_param.cuda() if torch.cuda.is_available() else orig_param
                p.grad = orig_param.data - p.data

            # Update and persist the interpolation weight for this user
            alpha = alpha_update(local_trainer.model, trainer.model, alpha, initial_lr)
            torch.save(alpha, local_alpha_name)
            local_trainer.model.zero_grad()

        # Mark that training (including post-processing) is finished
        end = time.time()
        client_stats['training'] = end - begin_training
        client_stats['full cost'] = end - begin
        print_rank(f'Client training cost {end - begin_training}s', loglevel=logging.DEBUG)
        print_rank(f'Client full cost {end - begin}s', loglevel=logging.DEBUG)

        # Create dictionary that is sent back to server (short keys to keep
        # the message small)
        client_output = {
            'cs': client_stats,
            'tl': train_loss,
            'mg': trainer.sufficient_stats['mag'],
            'vg': trainer.sufficient_stats['var'],
            'ng': trainer.sufficient_stats['mean'],
            'rg': trainer.sufficient_stats['norm'],
            'ns': num_samples,
            'pl': payload,
        }

        # Apply privacy metrics; a client may be dropped (weight 'wt' zeroed)
        # when it leaks too much information
        if privacy_metrics_config and privacy_metrics_config['apply_metrics']:
            print_rank('Applying privacy metrics', loglevel=logging.DEBUG)

            privacy_stats = {'Dropped clients': 0}
            batches = trainer.cached_batches
            trainer.cached_batches = []
            gradients = extensions.privacy.unroll_network(model.named_parameters(), select_grad=True)[0]

            if privacy_metrics_config['apply_indices_extraction']:
                allowed_word_rank = privacy_metrics_config.get('allowed_word_rank', 9000)
                embed_dim, vocab_size = model_config['embed_dim'], model_config['vocab_size']
                overlap, indices = privacy_metrics.extract_indices_from_embeddings(gradients, batches, embed_dim, vocab_size)

                max_overlap = privacy_metrics_config.get('max_allowed_overlap', None)
                if max_overlap is not None and overlap > max_overlap:
                    print_rank('Removing this client because we extracted {}% words and the maximum allowed is {}%'.format(overlap * 100, max_overlap * 100))
                    client_output['wt'] = 0.0
                    privacy_stats['Dropped clients'] = 1

                privacy_stats['Extracted indices percentage'] = overlap
                privacy_stats['Words percentage above ' + str(allowed_word_rank) + ' word rank'] = (indices > allowed_word_rank).mean() if len(indices)>0 else 0

            if privacy_metrics_config['apply_leakage_metric']:
                print_rank('Applying leakage metric', loglevel=logging.DEBUG)

                orig_params = {n: p for (n, _), p in zip(trainer.model.named_parameters(), model_parameters)}
                max_ratio = np.exp(privacy_metrics_config['max_leakage'])
                optim_config = privacy_metrics_config['attacker_optimizer_config']
                is_leakage_weighted = privacy_metrics_config['is_leakage_weighted']

                leakage = privacy_metrics.practical_epsilon_leakage(orig_params,
                    trainer.model, batches, is_leakage_weighted, max_ratio, optim_config)
                print_rank('privacy leakage: {}'.format(leakage), loglevel=logging.DEBUG)

                max_leakage = privacy_metrics_config.get('max_allowed_leakage', None)
                if max_leakage is not None and leakage > max_leakage:
                    print_rank('Removing this client because the information leakage/practical epsilon is {} and the maximum allowed is {}'.format(leakage, max_leakage))
                    client_output['wt'] = 0.0
                    privacy_stats['Dropped clients'] = 1

                privacy_stats['Practical epsilon (Max leakage)'] = leakage

            client_output['ps'] = privacy_stats

        client_output['ts'] = time.time()
        return client_output

================================================ FILE:
core/config.py ================================================ # Note this import requires python 3.7+ # Do we want to commit to this? from __future__ import annotations from dataclasses import dataclass from collections.abc import MutableMapping from cerberus import Validator from importlib.machinery import SourceFileLoader from utils.utils import print_rank from importlib.machinery import SourceFileLoader import os # TODO everywhere: choose reasonable defaults. # TODO: decide where task should live as a setting, maybe its own TaskConfig # TODO: docstrings everywhere # TODO: Make ModelConfig a base class that different models inherit from # We could specify the modelconfig class in the config file, # like we do for model.py. The current implementation mixes NLG and BERT # TODO: DatasetConfig needs to be teased apart. # The main issue is we have *_data, list_of_train_data, train_data_server. # They all essentially perform the same function in different contexts. # also some no-longer-used parameters are still present. # TODO: it's not clear what MutableMapping methods need overrides- we # could probably just use the default implementation. 
# TODO: not all pytorch optimizers can handle amsgrad - we should
# have distinct subclasses for the different optimizers


def from_dict(cls, config):
    """ Helper function to convert a dict to a class """
    return cls(**config)


class Config(MutableMapping):
    """Base class for configuration classes.

    Provides dict-like access (get/pop/in/len/iter) on top of plain
    attribute storage, plus dotted-path lookup for nested configs.
    """

    def get(self, k: str, default=None):
        # Unlike dict.get, an attribute explicitly set to None also
        # falls back to `default`.
        result = getattr(self, k, default)
        if result is None:
            return default
        return result

    def lookup(self, s: str, default=None):
        """Dotted-path lookup, e.g. lookup('optimizer_config.type').

        Recurses only through nested Config instances; any non-Config
        intermediate yields `default`.
        """
        toks = s.split('.')
        child = getattr(self, toks[0], default)
        if len(toks) == 1:
            return child if child is not None else default
        elif isinstance(child, Config):
            return child.lookup('.'.join(toks[1:]), default)
        else:
            return default

    def __getitem__(self, k):
        return getattr(self, k)

    def __setitem__(self, k, v):
        setattr(self, k, v)

    def __delitem__(self, k):
        delattr(self, k)

    def __iter__(self):
        return iter(self.__dict__)

    def __len__(self):
        return len(self.__dict__)

    def __contains__(self, k):
        # Mirrors get(): attributes set to None count as absent.
        return getattr(self, k, None) is not None

    def pop(self, k, default=None):
        result = self.get(k, default)
        if k in self:
            delattr(self, k)
        return result


@dataclass
class ModelConfig(Config):
    """Base class for Model configurations

    The model configuration specifies model architecture, parameters,
    and initialization settings.

    Attributes:
        model_type (str): The class name of the model to instantiate. eg GRU.
        model_folder (str): The relative path to the model.py file where
            model_type is defined. eg experiments/nlg_gru/model.py
        pretrained_model_path (str): The path to the pretrained model.
            If None, the model will be randomly initialized using the
            method defined in weight_init.
    """
    model_type: str = None
    model_folder: str = None
    pretrained_model_path: str = None

    @staticmethod
    def from_dict(config) -> ModelConfig:
        """Searches the model folder for config.py and if it is found the model
        config is initialized from the class [model_type]Config.

        NOTE: when no config.py exists, the raw dict is returned unchanged
        (not a ModelConfig) -- callers must tolerate both types.
        """
        cfg_path = os.path.dirname("./" + str(config['model_folder'])) + '/config.py'
        if os.path.exists(cfg_path):
            loader = SourceFileLoader('config', cfg_path).load_module()
            config_class = config['model_type'] + 'Config'
            try:
                config_type = getattr(loader, config_class)
                return from_dict(config_type, config)
            except AttributeError:
                print_rank(f"Config class {config_class} not found in {cfg_path}")
                raise
        else:
            print_rank(f"Warning: couldn't find {cfg_path}, falling back to dictionary.")
            return config


@dataclass
class BERTModelConfig(Config):
    """BERT model configuration

    The BERT configuration specifies huggingface-specific BERT model settings.

    Attributes:
        model_name (str): The name of the BERT model. eg bert-base-uncased.
        cache_dir (str): Tokenizer cache directory, will be created if it
            doesn't exist.
        use_fast_tokenizer (bool): Whether to use the fast tokenizer.
        mask_token (str): special token to use for masking.
        task (str): The task to use for BERT. eg mlm.
        past_index (int): The index of the past state in the BERT model's
            state dict.
        prediction_loss_only (bool): if False, also produce metrics for
            predictions and labels.
        process_line_by_line (bool): if True, process the input line-by-line.

    ToDo:
        * check how cache_dir is used- there's a risk of multiple processes
          reading/writing at the same time.
        * verify the meaning of past_index (thanks copilot)
        * document the difference when process_line_by_line is True vs False
    """
    model_name: str = None
    cache_dir: str = None
    use_fast_tokenizer: bool = False
    mask_token: str = ''
    task: str = 'mlm'
    past_index: int | None = -2
    prediction_loss_only: bool = False
    process_line_by_line: bool = False

    @staticmethod
    def from_dict(config) -> BERTModelConfig:
        return from_dict(BERTModelConfig, config)


@dataclass
class BERTTrainingConfig(Config):
    """BERT training configuration

    Configuration settings for BERT training.

    Attributes:
        seed (int): random seed for reproducibility.
        label_smoothing_factor (float): label smoothing factor. Applied label
            smoothing when the factor is non-zero.
        batch_size (int): batch size.
        max_seq_length (int): maximum input sequence length.
    """
    seed: int | None = None
    label_smoothing_factor: float | None = None
    batch_size: int | None = None
    max_seq_length: int | None = None

    @staticmethod
    def from_dict(config) -> BERTTrainingConfig:
        return from_dict(BERTTrainingConfig, config)


@dataclass
class BERTConfig(Config):
    """BERT configuration

    Specifies the model and training configuration for huggingface
    modeling scenarios.

    Attributes:
        loader_type (str): loader type hint. eg 'text'
        model (BERTModelConfig): BERT model configuration.
        training (BERTTrainingConfig): BERT training configuration.
    """
    loader_type: str = None
    model: BERTModelConfig = None
    training: BERTTrainingConfig = None

    @staticmethod
    def from_dict(config) -> BERTConfig:
        result = BERTConfig()
        for k in config:
            if k == 'model':
                result.model = BERTModelConfig.from_dict(config[k])
            elif k == 'training':
                result.training = BERTTrainingConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class PrivacyConfig(Config):
    """Privacy configuration

    The privacy configuration specified differential privacy settings for
    the model. The user can choose between local or global DP.
    When local DP is enabled, a global epsilon can be computed by applying
    the RDP accountant (see extensions/privacy). The `eps` parameter is used
    to specify the privacy budget for local DP. Conversely, when global DP
    is enabled, `eps` is ignored and `global_sigma` directly specifies the
    global Gaussian noise.

    `max_grad` specifies the clipping parameter for local or global DP,
    `max_weight` specifies the clipping parameter for the local gradient
    aggregation weight (applies to softmax aggregation), and `weight_scaler`
    indicates how the aggregation weight is scaled before noise addition,
    and unscaled afterward. This enables a single eps/sigma parameter for
    both the gradient and its weight.

    Example:
        This example applies local DP with eps=1000. The global epsilon
        will be computing using Renyi DP accounting.

        .. code-block:: yaml

            dp_config:
                # Local dp clips and adds noise on the client and centrally accumulates the privacy budget.
                enable_local_dp: true
                eps: 100 # epsilon
                max_grad: 0.008  # max gradient
                # The max_weight and min_weight should be already scaled by weight_scaler
                # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up.
                max_weight: 0.0001
                weight_scaler: 0.0001
                min_weight: 0.00009

    Attributes:
        enable_local_dp (bool): whether to enable local DP.
        enable_global_dp (bool): whether to enable global DP.
        eps (float): the privacy budget for local DP.
        delta (float): the privacy delta parameter for local DP.
        global_sigma (float): the global Gaussian noise for global DP.
        max_grad (float): the gradient clipping parameter.
        max_weight (float): the aggregation weight clipping parameter.
        weight_scaler (float): the aggregation weight scaling parameter.
        min_weight (float): the minimum per-gradient aggregation weight.
    """
    enable_local_dp: bool = False
    enable_global_dp: bool = False
    eps: float | None = None
    delta: float | None = None
    global_sigma: float | None = None
    max_grad: float | None = None
    max_weight: float | None = None
    weight_scaler: float | None = None
    min_weight: float | None = None

    @staticmethod
    def from_dict(config) -> PrivacyConfig:
        return from_dict(PrivacyConfig, config)


@dataclass
class PrivacyMetricsConfig(Config):
    """Privacy metrics configuration

    This optional feature computes local privacy metrics for computed
    gradients, and optionally filters gradients based on estimated
    privacy loss.

    Attributes:
        apply_metrics (bool): whether to compute privacy metrics.
        apply_indices_extraction (bool): whether to attempt local data
            reconstruction.
        allowed_word_rank (int): threshold for successful reconstruction.
        apply_leakage_metric (bool): whether to compute a privacy leakage
            metric based on the ratio of perplexities before and after
            local training.
        max_leakage (float): the maximum allowed privacy leakage before
            filtering
        adaptive_leakage_threshold (float): if non-zero, compute an adaptive
            leakage threshold based on the previous round of training.
            For example at 0.95, the max_leakage will be adjusted to reject
            5% of gradients, based on the previous round of training.
        is_leakage_weighted (bool): scales the leakage by the maximum
            likelihood of the pre- and post- likelihood tensors. ie the
            worst-case leakage is weighted by the worst-case likelihood
            that we might encounter it.
        attacker_optimizer_config (OptimizerConfig): the optimizer
            configuration for the reconstruction attack.
    """
    apply_metrics: bool = False
    apply_indices_extraction: bool = False
    allowed_word_rank: int | None = None
    apply_leakage_metric: bool = False
    max_leakage: float | None = None
    # NOTE(review): max_allowed_leakage is undocumented above and appears to
    # overlap with max_leakage -- confirm which one downstream code reads.
    max_allowed_leakage: float | None = None
    adaptive_leakage_threshold: float | None = None
    is_leakage_weighted: bool = False
    attacker_optimizer_config: OptimizerConfig = None

    @staticmethod
    def from_dict(config) -> PrivacyMetricsConfig:
        result = PrivacyMetricsConfig()
        for k in config:
            if k == 'attacker_optimizer_config':
                result.attacker_optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class OptimizerConfig(Config):
    """Optimizer configuration

    Pass any pytorch-supported optimizer configuration. The object should
    include a `type` field which indicates the pytorch optimizer type that
    should be invoked. This will be stripped from the object before being
    passed to the Optimizer's init.
    """
    type: str = None
    # Leave this open for any keyword arguments, so we don't break torch constructors
    # In the future we can limit keywords to torch-specific ones.
    # lr: float = 0.0
    # weight_decay: float = 0.0
    # amsgrad: bool = False

    @staticmethod
    def from_dict(config) -> OptimizerConfig:
        # needs its own from_dict so we can accomodate any fields
        result = OptimizerConfig()
        assert 'type' in config
        for k in config:
            setattr(result, k, config[k])
        return result


@dataclass
class AnnealingConfig(Config):
    """Learning rate annealing configuration

    Attributes:
        type (str): the type of annealing. Supported methods:
            :code:`step_lr`, :code:`multi_step_lr`,
            :code:`rampup-keep-expdecay-keep`, :code:`val_loss`.
        step_interval (str): the interval at which to step the learning
            rate. Supported intevals: :code:`epoch`, :code:`batch`.
        gamma (float): the learning rate decay factor.
        step_size (int): the interval between annealing operations.
    """
    type: str = None
    step_interval: str = None
    gamma: float | None = None
    step_size: int | None = None

    @staticmethod
    def from_dict(config) -> AnnealingConfig:
        return from_dict(AnnealingConfig, config)


@dataclass
class DatasetConfig(Config):
    """Dataset configuration shared by all dataloaders (see TODO at top of
    file: this class mixes NLG, MLM, and server/client-specific fields)."""
    # Common to all text (NLG, MLM) dataloaders
    batch_size: int | None = None
    loader_type: str = None
    prepend_datapath: bool = False
    num_workers: int | None = None
    desired_max_samples: int | None = None

    # Common to all client.train dataloaders
    list_of_train_data: str = None
    max_grad_norm: float | None = None  # propose moving max_grad_norm to client config

    # Common to all server.train dataloaders. What is the difference?
    train_data: str = None
    train_data_server: str = None

    # Common to server.test dataloaders
    test_data: str = None

    # Common to server.val dataloaders
    val_data: str = None

    # Specific to NLG dataloaders
    tokenizer_type: str = None  # Note tokenizer_type appears in NLG configs, but always set to 'not applicable'
    vocab_dict: str = None
    pin_memory: bool = False
    num_frames: int | None = None  # num_frames is missing from NLG server.test dataloader
    max_batch_size: int | None = None
    max_num_words: int | None = None
    unsorted_batch: int | None = None
    utterance_mvn: bool = False  # only present on NLG client.train dataloader

    # Specific to MLM dataloaders
    task: str = None
    mlm_probability: float | None = None
    tokenizer_type_fast: bool = False
    max_seq_length: int | None = None
    min_words_per_utt: int | None = None
    max_samples_per_user: int | None = None
    mask_token: str = None
    cache_dir: str = None

    @staticmethod
    def from_dict(config) -> DatasetConfig:
        return from_dict(DatasetConfig, config)


@dataclass
class DataConfig(Config):
    """Data configurations

    Client and server configs may each contain a data config, consisting
    of train, test, and validate datasets. A typical configuration will
    define test and validate in the server data config, while the training
    data is defined in the client config.

    Optionally, the server can have a training config which defines
    server-side training data.

    Attributes:
        train (DatasetConfig): the training dataset configuration.
        val (DatasetConfig): the validation dataset configuration.
        test (DatasetConfig): the test dataset configuration.
    """
    train: DatasetConfig = None
    val: DatasetConfig = None
    test: DatasetConfig = None

    @staticmethod
    def from_dict(config) -> DataConfig:
        train = DatasetConfig.from_dict(config['train']) if 'train' in config else None
        val = DatasetConfig.from_dict(config['val']) if 'val' in config else None
        test = DatasetConfig.from_dict(config['test']) if 'test' in config else None
        return DataConfig(train, val, test)


@dataclass
class ServerReplayConfig(Config):
    """Server replay configuration

    When server-side training data is defined, this config defines how it
    is applied after each client training round.

    Attributes:
        server_iterations (int): the number of iterations to run over
            server-side training data for.
        ignore_subtask (bool): used to determine which model loss to use.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use for the server.
    """
    server_iterations: int
    ignore_subtask: bool
    optimizer_config: OptimizerConfig

    @staticmethod
    def from_dict(config) -> ServerReplayConfig:
        return ServerReplayConfig(
            config['server_iterations'],
            config['ignore_subtask'],
            OptimizerConfig.from_dict(config['optimizer_config'])
        )


@dataclass
class RLConfig(Config):
    """Reinforcement learning configuration

    RL can be applied during dynamic gradient aggregation to speed up
    convergence. This configuration defines the settings for server-side
    RL to train the model for DGA.

    Attributes:
        marginal_update_RL (bool): whether to update the RL model when the
            loss is small.
        RL_path (str): the path to the RL model to train.
        RL_path_global (bool): whether the global training output path
            should be prepended to RL_path.
        model_descriptor_RL (str): string to append to the model filename.
        network_params (list): List of layer widths in the RL network.
            eg: 300,128,128,128,64,100
        initial_epsilon (float): the initial epsilon value for the
            epsilon-greedy policy.
        final_epsilon (float): the final epsilon value for the
            epsilon-greedy policy.
        epsilon_gamma (float): the decay rate for the epsilon-greedy policy.
        max_replay_memory_size (int): the maximum number of samples to
            store in the replay memory.
        minibatch_size (int): the size of the minibatch to use for training.
        gamma (float): the discount factor for the RL model.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use for the RL model.
        annealing_config (AnnealingConfig): the annealing configuration to
            use for the RL model.
    """
    marginal_update_RL: bool = False
    RL_path: str = None
    RL_path_global: bool = False
    model_descriptor_RL: str = None
    network_params: list = None
    initial_epsilon: float | None = None
    final_epsilon: float | None = None
    epsilon_gamma: float | None = None
    max_replay_memory_size: int | None = None
    minibatch_size: int | None = None
    gamma: float | None = None
    optimizer_config: OptimizerConfig = None
    annealing_config: AnnealingConfig = None

    @staticmethod
    def from_dict(config) -> RLConfig:
        result = RLConfig()
        for k in config:
            if k == 'optimizer_config':
                result.optimizer_config = OptimizerConfig.from_dict(config[k])
            elif k == 'annealing_config':
                result.annealing_config = AnnealingConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class ServerConfig(Config):
    """Server configuration

    The server configuration defines the server-side settings.

    Attributes:
        resume_from_checkpoint (bool): whether to resume training from a
            checkpoint.
        max_iteration (int): the maximum number of iterations (federated
            training rounds) to run.
        num_clients_per_iteration (int): the number of clients to use per
            training round.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use server-side.
        annealing_config (AnnealingConfig): the learning rate annealing
            configuration to use server-side.
        val_freq (int): the number of iterations between validation
            evaluation runs.
        rec_freq (int): the number of iterations between test evaluation
            runs.
        initial_val (bool): whether to run validation before initiating
            training.
        initial_rec (bool): whether to run test before initiating training.
        wantRL (bool): whether to train the RL model.
        RL (RLConfig): the RL configuration to use if wantRL is True.
        data_config (DataConfig): the data configuration to use server-side.
        type (str): the type of server. Currently this parameter is ignored
            and OptimizationServer is always used. However there is some
            validation code that checks for one of the following values:

            - model_averaging
            - optimization
            - model_optimization
            - cluster_finetuning
            - cluster_parallel
        aggregate_median (str): the aggregation method to use (DGA softmax,
            or mean). Note that this only applies when the global
            aggregation strategy is DGA.
        weight_train_loss (str): when softmax DGA is enabled, what metric
            to use for weighting. One of

            - train_loss
            - mag_var_loss
            - mag_mean_loss
        softmax_beta (float): the beta value to use for the softmax DGA.
        max_weight (float): the maximum allowed client weight.
        initial_lr_client (float): the initial learning rate for each client.
        lr_delay_factor (float): the client learning rate decay factor.
            NOTE(review): the field name looks like a typo for
            lr_decay_factor, but it is kept for config compatibility.
        best_model_criterion (str): The metric to choose when resetting to
            the best model so far.
        server_replay_config (ServerReplayConfig): the server replay
            configuration to use for any server-side training.
    """
    resume_from_checkpoint: bool = False
    max_iteration: int | None = None
    num_clients_per_iteration: int | None = None
    optimizer_config: OptimizerConfig = None
    annealing_config: AnnealingConfig = None
    val_freq: int | None = None
    rec_freq: int | None = None
    initial_val: bool = True
    initial_rec: bool = True
    wantRL: bool = False
    RL: RLConfig = None
    data_config: DataConfig = None
    type: str = None
    aggregate_median: str = None
    weight_train_loss: str = None
    softmax_beta: float | None = None
    max_weight: float | None = None
    initial_lr_client: float | None = None
    lr_delay_factor: float | None = None
    best_model_criterion: str = 'loss'
    server_replay_config: ServerReplayConfig = None

    @staticmethod
    def from_dict(config) -> ServerConfig:
        result = ServerConfig()
        for k in config:
            if k == 'optimizer_config':
                result.optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            elif k == 'annealing_config':
                result.annealing_config = \
                    AnnealingConfig.from_dict(config[k])
            elif k == 'data_config':
                result.data_config = \
                    DataConfig.from_dict(config[k])
            elif k == 'server_replay_config':
                result.server_replay_config = \
                    ServerReplayConfig.from_dict(config[k])
            elif k == 'RL':
                result.RL = \
                    RLConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class ClientConfig(Config):
    """ Client configuration

    The client configuration defines the client-side settings.

    Attributes:
        meta_learning (str): Set to 'basic'. Currently ignored.
        stats_on_smooth_grad (bool): When true, gradient statistics are
            reset each round. Currently, it appears these statistics
            aren't used.
        ignore_subtask (bool): Used to determine which model loss to use.
            In most cases just set to False.
        num_skips_threshold (int): previously used to skip users,
            deprecated.
        copying_train_data (bool): has no effect.
        do_profiling (bool): whether to enable client-side profiling.
        data_config (DataConfig): the data configuration to use client-side.
        type (str): the type of client. Currently this parameter is ignored?
        meta_optimizer_config (OptimizerConfig): the optimizer configuration
            to use for meta-learning.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use for client-side training.
        annealing_config (AnnealingConfig): the learning rate annealing
            configuration to use client-side.
    """
    meta_learning: str = None
    stats_on_smooth_grad: bool = False
    ignore_subtask: bool = False
    num_skips_threshold: int | None = None
    copying_train_data: bool = False
    do_profiling: bool = False
    data_config: DataConfig = None
    type: str = None
    meta_optimizer_config: OptimizerConfig = None
    optimizer_config: OptimizerConfig = None
    annealing_config: AnnealingConfig = None

    @staticmethod
    def from_dict(config) -> ClientConfig:
        result = ClientConfig()
        for k in config:
            if k == 'data_config':
                result.data_config = DataConfig.from_dict(config[k])
            elif k == 'meta_optimizer_config':
                result.meta_optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            elif k == 'optimizer_config':
                result.optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            elif k == 'annealing_config':
                result.annealing_config = \
                    AnnealingConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class FLUTEConfig(Config):
    """ FLUTEConfig represents the global configuration for a training job.

    Attributes:
        model_config (ModelConfig): the model configuration to use.
        dp_config (PrivacyConfig): differential privacy configuration.
        strategy (str): Aggregation strategy, eg DGA or FedAvg.
        server_config (ServerConfig): the server configuration to use.
        client_config (ClientConfig): the client configuration to use.
    """
    model_config: ModelConfig = None
    dp_config: PrivacyConfig = None
    privacy_metrics_config: PrivacyMetricsConfig = None
    strategy: str = None
    server_config: ServerConfig = None
    client_config: ClientConfig = None

    # NOTE(review): written as an unbound function taking the raw config
    # dict (no `self`/@staticmethod); intended to be called as
    # FLUTEConfig.validate(config_dict). A Config instance also satisfies
    # the mapping interface, so instance calls self-validate.
    def validate(config):
        """Join relative paths in the config with the job's data/output
        paths, and propagate BERT model settings into the data configs.
        Returns the (mutated) config."""

        # Join paths in config file
        if config["server_config"]["wantRL"]:
            rl_path = config["server_config"]["RL"]["RL_path"]
            rl_path = os.path.join(config["output_path"], rl_path) \
                if config["server_config"]["RL"].get("RL_path_global", True) \
                else os.path.join(config["output_path"], config["experiment_name"], rl_path)
            # BUGFIX: the joined path used to be computed and discarded,
            # leaving RL_path relative; write it back so the RL model is
            # stored under the intended output directory.
            config["server_config"]["RL"]["RL_path"] = rl_path

        if "pretrained_model_path" in config["model_config"]:
            config["model_config"]["pretrained_model_path"] = os.path.join(config["data_path"], config["model_config"]["pretrained_model_path"])

        for section in ["server_config", "client_config"]:
            for mode in ['test', 'val', 'train']:
                if mode in config[section]["data_config"] and "vocab_dict" in config[section]["data_config"][mode]:
                    config[section]["data_config"][mode]["vocab_dict"] = os.path.join(config['data_path'], config[section]["data_config"][mode]["vocab_dict"])

                # TODO: Remove BERT specific parameters
                if 'BERT' in config['model_config']:
                    if mode != 'train':
                        config['server_config']['data_config'][mode]['model_name_or_path'] = config['model_config']['BERT']['model']['model_name']
                        config['server_config']['data_config'][mode]['process_line_by_line'] = config['model_config']['BERT']['model']['process_line_by_line']
                    else:
                        config['client_config']['data_config'][mode]['model_name_or_path'] = config['model_config']['BERT']['model']['model_name']
                        config['client_config']['data_config'][mode]['process_line_by_line'] = config['model_config']['BERT']['model']['process_line_by_line']

        return config

    @staticmethod
    def from_dict(config) -> FLUTEConfig:
        # Validate schema in config file.
        # NOTE(review): schema.py is eval'd as a Python literal; this is
        # only safe against trusted, repo-local files.
        with open('./core/schema.py', 'r') as schema_file:
            schema = eval(schema_file.read())
        v = Validator(schema)
        if not v.validate(config, schema):
            raise ValueError('Missing {} argument in config file '.format(v.errors))

        # Normalize default values and report which ones were filled in.
        original_config = config
        config = v.normalized(config)
        for section in ['server_config', 'client_config']:
            for mode in config[section]['data_config'].keys():
                diff = config[section]['data_config'][mode].keys() - original_config[section]['data_config'][mode].keys()
                if len(diff) > 0:
                    print_rank("Assigning default values for: {} in [{}][{}][data_config]".format(diff, section, mode))

        dp_config = \
            PrivacyConfig.from_dict(config['dp_config']) \
            if 'dp_config' in config else None
        priv_metrics_config = \
            PrivacyMetricsConfig.from_dict(config['privacy_metrics_config']) \
            if 'privacy_metrics_config' in config else None
        strategy = config.get('strategy', 'DGA')

        return FLUTEConfig(
            ModelConfig.from_dict(config['model_config']),
            dp_config, priv_metrics_config, strategy,
            ServerConfig.from_dict(config['server_config']),
            ClientConfig.from_dict(config['client_config'])
        )


# ================================================
# FILE: core/dataloader.py
# ================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from torch.utils.data import DataLoader as PyTorchDataLoader
from abc import ABC


class BaseDataLoader(ABC, PyTorchDataLoader):
    '''This is a wrapper class for PyTorch dataloaders.'''

    def create_loader(self):
        '''Returns the dataloader'''
        return self


# ================================================
# FILE: core/dataset.py
# ================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from torch.utils.data import Dataset as PyTorchDataset from abc import ABC, abstractmethod class BaseDataset(ABC, PyTorchDataset): '''This is a wrapper class for PyTorch datasets.''' @abstractmethod def __init__(self,**kwargs): super(BaseDataset, self).__init__() @abstractmethod def __getitem__(self, idx, **kwargs): '''Fetches a data sample for a given key''' pass @abstractmethod def __len__(self): '''Returns the size of the dataset''' pass @abstractmethod def load_data(self,**kwargs): '''Wrapper method to read/instantiate the dataset''' pass ================================================ FILE: core/evaluation.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' In this file we define the functions for running test and validation tasks inside the Server. ''' import logging import torch import numpy as np # Internal imports import core.federated as federated from core.client import Client from utils import print_rank # AzureML-related libs from azureml.core import Run run = Run.get_context() class Evaluation(): def __init__(self, config, model_path, process_testvalidate, idx_val_clients, idx_test_clients, single_worker): self.config = config self.model_path = model_path self.process_testvalidate = process_testvalidate self.server_type = config['server_config']['type'] self.idx_val_clients = idx_val_clients self.idx_test_clients = idx_test_clients self.send_dicts = config['server_config'].get('send_dicts', False) self.single_worker = single_worker super().__init__() def run(self, eval_list, req, metric_logger=None): '''Run test/validation taks depending on the modes received in the eval_list. Args: eval_list (arr): Contains the tasks to run. req (dict): information for test/val tasks metric_logger (callback, optional): callback used for logging. Defaults to None, in which case AML logger is used. 
''' self.worker_trainer = req['worker_trainer'] if self.send_dicts: global_model_values = [self.worker_trainer.model.state_dict()[param_key].to(torch.device('cpu')) for param_key in self.worker_trainer.model.state_dict()] else: global_model_values = [p.data.to(torch.device('cpu')) for p in self.worker_trainer.model.parameters()] if 'tmp_unsup' in req: unsup_values = req['tmp_unsup'].values() sup_values = req['tmp_sup'].values() semisupervision_inference = True else: semisupervision_inference = False save_model = False if metric_logger is None: metric_logger = run.log for mode in eval_list: # Skipping validation round when RL is enabled if 'wantRL' in self.config['server_config'] and self.config['server_config']['wantRL'] and mode == "val": continue # Compute avg_loss and avg_acc self.metrics = self.run_distributed_inference(mode, global_model_values) req = self.initialize_req(req) if len(req) == 1 else req # Only if for semisupervision if semisupervision_inference: unsup_metrics = self.run_distributed_inference(mode, unsup_values) sup_metrics = self.run_distributed_inference(mode, sup_values) for key, value in unsup_metrics.items(): metric_logger(str("Unsup" +mode + " " + key).capitalize(), value['value']) print_rank('LOG UNSUP: {}_{}={}'.format(mode, key, value['value'])) for key, value in sup_metrics.items(): metric_logger(str("Sup" + mode + " " + key).capitalize(), value['value']) print_rank('LOG SUP: {}_{}={}'.format(mode, key, value['value'])) # Log metrics for key, value in self.metrics.items(): metric_logger(str(mode + " " + key).capitalize(), value['value']) print_rank('LOG: {}_{}={}: best_{}_{}={}'.format(mode, key, value['value'], mode, key, req[str("best_"+ mode + "_" + key)])) for key,value in self.metrics.items(): attr = str("best_"+ mode + "_" + key) if value['higher_is_better']: if self.metrics[key]['value'] > req[attr]: req[attr] = self.metrics[key]['value'] save_model = True else: if self.metrics[key]['value'] < req[attr]: req[attr] = 
self.metrics[key]['value'] save_model = True if save_model and mode == 'val': self.worker_trainer.save( model_path=self.model_path, token=str('best_'+ mode +'_'+key), config=self.config['server_config'] ) save_model = False return req def initialize_req(self, req): '''Update the keys, to have the same as metrics dictionary. This function is only used during itr=0 for initializing the req dictionary. Args: req (dict): Best results for all the metrics (e.g. best_val_acc). ''' for mode in ['test','val']: for key in self.metrics.keys(): attr = "best_"+ mode + "_" + key req[attr] = -1.0 if self.metrics[key]['higher_is_better'] else float('inf') return req def run_distributed_inference(self, mode, model): '''Call `run_distributed_evaluation` specifically for test or validation. This is just a helper function that fetches the clients depending on the mode and calls `run_distributed_evaluation` using that list. Args: mode (str): `test` or `val`. ''' if mode == 'val': clients = self.idx_val_clients elif mode == 'test': clients = self.idx_test_clients else: raise NotImplementedError('Unsupported mode: {}'.format(mode)) return self.run_distributed_evaluation(mode, clients, model) def run_distributed_evaluation(self, mode, clients, model): '''Perform evaluation using available workers. See also `process_test_validate` on federated.py. Args: mode (str): `test` or `val`. clients (list): clients for test/val round. 
''' total = 0 self.logits = {'predictions': [], 'probabilities': [], 'labels': []} server_data = (0.0, model, 0) for result in self.process_testvalidate(clients, server_data, mode, self.single_worker): output, metrics, count = result val_metrics = {key: {'value':0, 'higher_is_better': False} for key in metrics.keys()} if total == 0 else val_metrics for key in val_metrics: val_metrics[key]['value'] += metrics[key]['value']* count val_metrics[key]['higher_is_better'] = metrics[key]['higher_is_better'] total+= count if output is not None: self.logits['predictions'].append(output['predictions']) self.logits['probabilities'].append(output['probabilities']) self.logits['labels'].append(output['labels']) if self.logits['probabilities'] and self.logits['predictions'] and self.logits['labels']: self.logits['predictions'] = np.concatenate(self.logits['predictions']) self.logits['probabilities'] = np.concatenate(self.logits['probabilities']) self.logits['labels'] = np.concatenate(self.logits['labels']) for key in val_metrics: val_metrics[key]['value'] = val_metrics[key]['value']/total self.losses = [val_metrics['loss']['value'], val_metrics['acc']['value']] # For compatibility with Server return val_metrics def make_eval_clients(dataset, config): '''Generator that yields clients for evaluation, continuously. 
Args: dataset (torch.utils.data.Dataset): used to get client's data config (dict): used for the client's constructor ''' total = sum(dataset.num_samples) clients = federated.size() - 1 if federated.size()>1 else federated.size() delta = total / clients + 1 threshold = delta current_users_idxs = list() current_total = 0 if config["server_config"]["type"] == "personalization": for i in range(len(dataset.user_list)): yield Client([i], config, False) else: for i in range(len(dataset.user_list)): current_users_idxs.append(i) count = dataset.num_samples[i] current_total += count if current_total > threshold: print_rank(f'sending {len(current_users_idxs)} users', loglevel=logging.DEBUG) yield Client(current_users_idxs, config, False) current_users_idxs = list() current_total = 0 if len(current_users_idxs) != 0: print_rank(f'sending {len(current_users_idxs)} users -- residual', loglevel=logging.DEBUG) yield Client(current_users_idxs, config, False) ================================================ FILE: core/federated.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import cProfile import logging import threading import torch import torch.distributed as dist import numpy as np from core.client import Client from utils import ( print_rank, print_profiler, to_device, ) COMMAND_UPDATE = 0 COMMAND_TRAIN = 1 COMMAND_TERMINATE = 10 COMMAND_TESTVAL = 11 COMMAND_SYNC_NODES = 9 GLOBAL_MESSAGE = None def encode_string(word, string_to_int = True): """ Encodes/Decodes the dictionary keys into an array of integers to be sent as tensors of the same shape during NCCL/Gloo P2P communication. Args: word (string/array): key to be encoded/decoded. string_to_int (bool): flag that indicates which action to perform. 
""" if string_to_int: # encode word = word.ljust(8, ' ') if len(word) < 8 else word # padding -- 8 is max length, all tensors must have the same size during communication word_encoded = [letter for letter in word.encode()] return word_encoded else: #decode cleanup_array = [letter for letter in word if letter!= 32] # Remove padding word_decoded = bytes(cleanup_array).decode() return word_decoded def rank(): """ Return rank of node. """ return int(os.environ['RANK']) def local_rank(): """ Return local rank of node. """ return int(os.environ['LOCAL_RANK']) def size(): """ Returns number of nodes in the distributed group, including server. """ return int(os.environ['WORLD_SIZE']) def _recv(x, src=0): """ Receives tensors with a single element or a list of tensors with the same shape during distributed communication. """ x = torch.tensor(x) if torch.is_tensor(x) == False else x x = to_device(x) dist.recv(tensor=x, src=src) x.to('cpu') try: return x.item() # single element except: return x.tolist() # list of tensors def _recv_gradients(src): """ Receives a list of tensors with different shape during distributed communication. """ n, n_dimensions, grads = 0, 0, [] # tensors intialization -- required by torch. n = _recv(n,src) for i in range(n): n_dimensions = _recv(n_dimensions,src) dimensions = [0 for i in range(n_dimensions)] dimensions = _recv(dimensions, src) print_rank(f"Received dimensions {dimensions}", loglevel=logging.DEBUG) param = to_device(torch.zeros(dimensions)) print_rank(f"Shape assigned {param.shape}", loglevel=logging.DEBUG) dist.recv(param,src) grads.append(param.detach().cpu()) torch.cuda.empty_cache() return grads def _send(x, dst=0): """ Send tensors with a single element or a list of tensors with the same shape during distributed communication. 
""" x = torch.tensor(x) x = to_device(x) dist.send(x, dst) del x torch.cuda.empty_cache() def _send_metrics(output): """ Organize the keys and values from the resulting dictionary from test/val rounds into arrays that are sent as independent tensors during distributed communication. """ keys = [encode_string(key) for key in output.keys()] values = [float(output[key]['value']) for key in output.keys()] higher_is_better = [int(output[key]['higher_is_better']) for key in output.keys()] # send the boolean as int _send(len(keys),0) _send(keys) _send(values) _send(higher_is_better) def _send_gradients(gradients, dst): """ Send a list of tensors with different shape during distributed communication. """ _send(len(gradients), dst) for i in gradients: dimensions = [int(d) for d in i.shape] _send(len(dimensions),dst) _send(dimensions,dst) param = to_device(i) dist.send(param,dst) del param torch.cuda.empty_cache() def _send_train_output(output): """ Organize the keys and values from the the returning ´client_output´ dictionary in ´Client.proces_round()´ function during training rounds, into arrays that are sent as independent tensors during distributed communication. 
""" cs_values = [float(cs_v) for cs_v in output['cs'].values()] # cs dict -- values are flatten in 1d array pl_values = [float(output['pl']['weight'])] # pl dict gradients = output['pl']['gradients'] # gradients are sent independently if len(output.keys()) > 9: # DP metrics ps_values = [float(ps_v) for ps_v in output['ps'].values()] values = cs_values + [float(output[key]) for key in output.keys() if key not in ['cs','pl','ps']] + pl_values + ps_values # reorganizing values in the order expected by the Server else: values = cs_values + [float(output[key]) for key in output.keys() if key not in ['cs','pl']] + pl_values # reorganizing values in the order expected by the Server # Send data _send(int(len(output.keys())),0) # Warn for number of keys _send(values, 0) _send_gradients(gradients, 0) def build_grads_dict(node): """ Reconstruct the dictionary ´client_output´ returned by ´Client.proces_round()´ function on the Server side during distributed communication. """ # Initialize tensors n_keys = 0 n_keys = _recv(n_keys,node) print(n_keys) if n_keys == 9: keys = ['cs','tl','mg','vg','ng','rg','ns','ts','pl'] values = [0.0 for i in range(11)] # initializing tensor shape -- 11 is fixed number of keys expected elif n_keys == 10: keys = ['cs','tl','mg','vg','ng','rg','ns','ts','pl','ps'] values = [0.0 for i in range(15)] # When the privacy metrics are enabled elif n_keys == 11: keys = ['cs','tl','mg','vg','ng','rg','ns','wt','ts','pl','ps'] values = [0.0 for i in range(16)] # When the privacy metrics are enabled # Read data values = _recv(values,node) grads = _recv_gradients(node) cs_values = [{key: values.pop(0) for key in ['setup','training','full cost']}] # recreating cs dict # Rebuilding original dictionary if n_keys == 9: pl_values = [{'weight':values.pop(), 'gradients': grads}] # recreating pl dict values_list = cs_values + [values.pop(0) for i in range(7)] + pl_values # 7 is fixed length for remaining items else: ps_values = [{key: values.pop() for key in 
['Practical epsilon (Max leakage)','Words percentage above 9000 word rank','Extracted indices percentage','Dropped clients']}] pl_values = [{'weight':values.pop(), 'gradients': grads}] # recreating pl dict values_list = cs_values + [values.pop(0) for i in range(len(values))] + pl_values + ps_values result = dict(zip(keys,values_list)) # Cast values to original type for key in ['mg','vg','ng','rg']: result[key] = np.float32(result[key]) result['ns'] = int(result['ns'] ) return result def build_metrics_dict(node): """ Reconstruct the dictionary returned during test/val rounds on the Server side during distributed communication. """ # Initialize tensors n = 0 n = _recv(n,node) keys = [[0 for j in range(8)] for i in range(n)] # max_seq_len for metric name is 8 values = [0.0 for i in range(n)] higher_is_better = [0 for i in range(n)] # Read data keys = _recv(keys,node) values = _recv(values,node) higher_is_better = _recv(higher_is_better,node) # Reorganize output + decode dict keys orig_keys = [encode_string(key, string_to_int=False) for key in keys] values_dict = [{'value': float(v), 'higher_is_better': bool(higher_is_better[i])} for i, v in enumerate(values)] metrics = dict(zip(orig_keys,values_dict)) num_instances = int(metrics.pop('num')['value']) result = None, metrics, num_instances return result def receive_workers_output(node_request_map, results_list, free_nodes, command, idle_nodes): """ Receives the clients output on the Server side in async/sync mode. 
Asynchronous mode is only enabled when using NCCL backend given that Gloo does not provide native non-blocking implementation to check if the operation has been completed during distributed training""" if dist.get_backend() == "nccl": # Async for node, req in node_request_map: if req.is_completed(): result = build_metrics_dict(node) if command == COMMAND_TESTVAL else build_grads_dict(node) results_list.append(result) free_nodes.append(node) node_request_map.remove((node,req)) print_rank(f"Finished releasing the nodes {free_nodes}", loglevel=logging.DEBUG) else: # Sync print_rank(f"Waiting for a workers", loglevel=logging.DEBUG) gather_objects = [(None,None,None) for i in range(size())] output = [None for _ in gather_objects] dist.all_gather_object(output, gather_objects[rank()]) print_rank(f" All workers have finished ... taking the remaining clients {len(output)}", loglevel=logging.DEBUG) output = [e for i,e in enumerate(output) if i not in idle_nodes ] # Cleanup for idle workers results_list = results_list + output[1:] free_nodes = list(range(1, size())) return node_request_map, results_list, free_nodes def append_async_requests(node_request_map, node): """ Appends the asynchronous request sent to each worker during asynchronous training. """ ack = to_device(torch.tensor(1)) req = dist.irecv(tensor=ack, src=node) node_request_map.append((node,req)) return node_request_map def sync_idle_nodes(client_queue, free_nodes): """ Request dummy outputs to the odd (idle) nodes during synchronous training to prevent them to get trapped in the state of the previous iterations """ idle_nodes = [] if len(client_queue) == 0: print_rank(f"Free idle nodes {len(free_nodes)}", loglevel=logging.DEBUG) while len(free_nodes) > 0: node = free_nodes.pop() idle_nodes.append(node) _send(COMMAND_SYNC_NODES, node) return idle_nodes class Server: """Server object responsible for orchestration and aggregation. 
The Server is one of the two objects that may exist inside of a thread, all throughout its execution (the other being the Worker). At every round, the Server samples clients ids and send their data for an available Worker to process. The Workers then each produce a new model, and all models are sent to the Server for aggregation. The methods defined here are related to orchestration only, the aggregation will be done by a different object which inherits from this one. Notes: This class has no :code`__init__` method, and all its methods are static. It thus only serves the purpose of grouping the methods, but nothing is actually stored inside of the object. """ @staticmethod def dispatch_clients(clients, server_data, command, mode=None, do_profiling=False, single_worker=None): """Perform the orchestration between Clients and Workers. This function does the following: 1. It sends the server_data to all workers 2. For each available Worker: 2a. It sends the index of the client to instantiate 2c. It triggers the execution of the command on the Client. 3. Collect and return all client outputs. Notes: This function yields the gradients of different clients as they are received. Therefore, the order of the results generally does not correspond to the order of the clients. All commands used during Server-Worker communication must be float/integers given that torch.distributed only allows to send/recv tensors. Args: clients (list): list of clients to be processed. server_data (dict): server data sent to the workers and passed to clients, typically includes the global model at that step. command (int): instruction for worker to execute on the Client. mode (int): test/val only provided during evaluation rounds. do_profiling (bool): enables profiler during comunication. Returns: Generator of results. 
""" # Single GPU flag single_gpu = True if size()==1 else False print_rank(f"Single GPU flag Server: {single_gpu}", loglevel=logging.DEBUG) # Some cleanup torch.cuda.empty_cache() torch.cuda.synchronize() if torch.cuda.is_available() else None # Initialize communication profiler profiler = None if do_profiling: profiler = cProfile.Profile() profiler.enable() # Update lr + model parameters each round for all workers lr, model_params, nround = server_data if not single_gpu: for worker_rank in range(1, size()): _send(COMMAND_UPDATE, worker_rank) _send(lr,worker_rank) _send_gradients(model_params, worker_rank) _send(float(nround),worker_rank) print_rank(f"Finished sending lr {lr} and n_params {len(model_params)} to worker {worker_rank} - round {nround}", loglevel=logging.DEBUG) print_rank(f"Finished sending server_data to workers", loglevel=logging.DEBUG) client_queue = clients.copy() print_rank(f"Clients queue: {client_queue}", loglevel=logging.DEBUG) free_nodes = list(range(1, size())) results_list, node_request_map = [], [] # Initiate computation for all clients while client_queue: print_rank(f"Clients queue: {client_queue}", loglevel=logging.DEBUG) assert len(free_nodes) > 0 node = free_nodes.pop() index = len(client_queue)-1 client_to_process = client_queue.pop(index) print_rank(f"Sending client {index} to worker {node}", loglevel=logging.DEBUG) _send(command, node) # The command should indicate the worker which function to run on the client if command == COMMAND_TESTVAL: _send(mode,node) # Only for test/val has a value _send(index, node) # Worker receives the index of the client to pop elif command == COMMAND_TRAIN: _send(client_to_process, node) print_rank(f"Finished assigning worker {node}, free nodes {free_nodes}", loglevel=logging.DEBUG) if dist.get_backend() == "nccl": append_async_requests(node_request_map, node) idle_nodes = None else: idle_nodes = sync_idle_nodes(client_queue, free_nodes) # Waits until receive the output from all ranks if not free_nodes: 
print_rank(f"Waiting for a workers, free nodes {free_nodes}, reqs_lst {node_request_map}", loglevel=logging.DEBUG) while len(free_nodes) == 0: node_request_map, results_list, free_nodes = receive_workers_output(node_request_map, results_list, free_nodes, command, idle_nodes) for output in results_list: yield output results_list = [] # Wait for all workers to finish while (len(node_request_map)) != 0: node_request_map, results_list, free_nodes = receive_workers_output(node_request_map, results_list, free_nodes, command, idle_nodes) for output in results_list: yield output results_list = [] else: # For a single-GPU execution, there is no P2P communication in the same GPU. Using threats to coordinate. global GLOBAL_MESSAGE GLOBAL_MESSAGE = server_data if command == COMMAND_TESTVAL: t1 = threading.Thread(target=single_worker.trigger_evaluate) t1.start() t1.join() yield GLOBAL_MESSAGE elif command == COMMAND_TRAIN: total_clients = clients.copy() for client_id in total_clients: GLOBAL_MESSAGE = lr, model_params, nround, client_id t1 = threading.Thread(target=single_worker.trigger_train) t1.start() t1.join() result = GLOBAL_MESSAGE yield result if do_profiling: profiler.disable() print_profiler(profiler) # Some cleanup torch.cuda.empty_cache() torch.cuda.synchronize() if torch.cuda.is_available() else None @staticmethod def process_clients(clients, server_data, single_worker): """Ask workers to perform training on Clients. Args: clients (list): list of clients indexes sampled by ´Server.py´ object per iteration. server_data (dict): dictionary containing model. Returns: Generator of results. """ return Server.dispatch_clients(clients, server_data, COMMAND_TRAIN, single_worker=single_worker) @staticmethod def process_testvalidate(clients, server_data, mode, single_worker): """Ask workers to perform test/val on Clients. Args: clients (list): list of clients indexes for test/val rounds. server_data (dict): dictionary containing model. mode (str): test/val. 
        Returns:
            Generator of results.
        """

        # Encode the mode as a one-element int list: -2 == test, 2 == val.
        mode = [-2] if mode == "test" else [2]
        return Server.dispatch_clients(clients, server_data, COMMAND_TESTVAL, mode, single_worker=single_worker)

    @staticmethod
    def terminate_workers(terminate=True):
        """Terminate the execution of the workers."""

        if terminate:
            print_rank("Terminating worker processes")
            for worker_rank in range(1, size()):
                _send(COMMAND_TERMINATE, worker_rank)

class Worker:
    """Worker object responsible for instantiate Clients based on incoming
    data from the Server and perform train/eval functions on it.

    Each worker lives on a different NCCL/Gloo thread and is assigned to a
    different GPU. Via the :code:`dispatch_clients` function, the Server
    passes to the Worker specific instructions to process clients' data,
    typically in order to generate a new model or to compute metrics.

    Attributes:
        model (torch.nn.Module): model being trained.
        data_path (str): path where all clients' data is located.
        do_profiling (bool): if True, analyzes execution in depth.
        val_clients (list): clients list for validation rounds.
        test_clients (list): clients list for testing rounds.
        config (dict): clients configuration.
        val_dataset (torch.utils.data.Dataset): validation dataset.
        test_dataset (torch.utils.data.Dataset): testing dataset.
    """

    def __init__(self, model=None, data_path=None, do_profiling=False, val_clients= None, \
            test_clients=None, config=None, val_dataset = None, test_dataset = None):

        self.model = model
        self.data_path = data_path
        self.do_profiling = do_profiling
        self.config = config
        self.val_clients = val_clients
        self.test_clients = test_clients
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset

    def run(self):
        """Main loop executed by worker nodes.

        This method handles the NCCL/Gloo communication between the worker
        and the server. It keeps listening for commands from the Server, and
        performs different actions on the Client assigned depending on the
        command received.
        """

        # Single GPU flag -- with a single node the Server drives this Worker
        # directly via trigger_train/trigger_evaluate, so the loop is skipped.
        single_gpu = True if size()==1 else False
        print_rank(f"Single GPU flag Client: {single_gpu}", loglevel=logging.DEBUG)

        if not single_gpu:
            while True: # keeps listening for incoming server calls

                # Initialize tensors -- required by torch.distributed
                command, client_idx, mode = 0, 0, 0 # int
                lr, nround = torch.zeros(1), torch.zeros(1) # float

                # Read command
                command = _recv(command)
                print_rank(f"Command received {command} on worker {rank()}", loglevel=logging.DEBUG)

                # Receive server data -- lr, model_params
                if command == COMMAND_UPDATE:
                    # Round state pushed by the Server at the start of every
                    # round; cached in server_data for the commands below.
                    print_rank(f"COMMMAND_UPDATE received {rank()}", loglevel=logging.DEBUG)
                    lr = _recv(lr, 0)
                    model_params = _recv_gradients(0)
                    nround = _recv(nround, 0)
                    server_data = (lr, model_params, int(nround))
                    print_rank(f"Received lr: {lr} and n_params: {len(model_params)} - round {nround}", loglevel=logging.DEBUG)

                elif command == COMMAND_TRAIN:
                    print_rank(f"COMMMAND_TRAIN received {rank()}", loglevel=logging.DEBUG)

                    # Init profiler in training worker
                    profiler = None
                    if self.do_profiling:
                        profiler = cProfile.Profile()
                        profiler.enable()

                    # Receive client id from Server
                    client_idx = _recv(client_idx)
                    print_rank(f"Cliend idx received from Server: {client_idx}", loglevel=logging.DEBUG)

                    # Instantiate client
                    client_to_process = Client(
                        [client_idx],
                        self.config,
                        self.config['client_config']['type'] == 'optimization')

                    # Execute Client.get_data()
                    client_data = client_to_process.get_client_data()

                    # Execute Client.process_round()
                    output = client_to_process.process_round(client_data, server_data, self.model, self.data_path)

                    # Send output back to Server
                    if dist.get_backend() == "nccl": # ASYNC mode -- enabled only for nccl backend
                        # Ack first so the Server's pending irecv completes,
                        # then stream the training output.
                        ack = to_device(torch.tensor(1))
                        dist.isend(tensor=ack, dst=0)
                        _send_train_output(output)
                    else: # SYNC mode -- gloo backend does not have a non-blocking way to check if the operation is completed
                        gather_objects = [output for i in range(size())]
                        output = [None for _ in gather_objects]
                        dist.all_gather_object(output,
                            gather_objects[rank()])

                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None

                    if self.do_profiling:
                        profiler.disable()
                        print_profiler(profiler)

                elif command == COMMAND_TESTVAL:
                    print_rank(f"COMMMAND_TESTVAL received {rank()}", loglevel=logging.DEBUG)

                    # Init profiler in validation worker
                    profiler = None
                    if self.do_profiling:
                        profiler = cProfile.Profile()
                        profiler.enable()

                    # Receive mode and client id from Server
                    mode = _recv(mode)
                    # The Server encodes the mode as -2 == test, 2 == val.
                    mode = "test" if mode == -2 else "val"
                    client_idx = _recv(client_idx)
                    print_rank(f"Client idx received from Server: {client_idx}, {mode}", loglevel=logging.DEBUG)

                    # Get client and dataset
                    clients = self.val_clients if mode == "val" else self.test_clients
                    dataset = self.val_dataset if mode == "val" else self.test_dataset
                    clients_queue = clients.copy()
                    assert 0 <= client_idx < len(clients_queue)
                    client_to_process = clients_queue.pop(client_idx)

                    # Execute Client.get_data()
                    client_data = client_to_process.get_client_data(dataset)

                    # Execute Client.run_testvalidate()
                    output = client_to_process.run_testvalidate(client_data, server_data, mode, self.model)

                    # Send output back to Server
                    if dist.get_backend() == "nccl": # ASYNC mode -- enabled only for nccl backend
                        # Piggyback the sample count as a pseudo-metric 'num';
                        # the Server pops it back out in build_metrics_dict.
                        _, metrics, num_instances = output
                        metrics['num']= {'value': float(num_instances), 'higher_is_better': False}
                        output = metrics
                        print_rank(f"Worker {rank()} output {output}", loglevel=logging.DEBUG)
                        ack = to_device(torch.tensor(1))
                        dist.isend(tensor=ack, dst=0)
                        _send_metrics(output)
                    else: # SYNC mode -- gloo backend does not have a non-blocking way to check if the operation is completed
                        gather_objects = [output for i in range(size())]
                        output = [None for _ in gather_objects]
                        dist.all_gather_object(output, gather_objects[rank()])
                        print_rank(f"Worker {rank()} sent output back", loglevel=logging.DEBUG)

                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None

                    if self.do_profiling:
                        profiler.disable()
                        print_profiler(profiler)

                elif command == COMMAND_TERMINATE:
                    print_rank(f"COMMMAND_TERMINATE received {rank()}", loglevel=logging.DEBUG)
                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None
                    return

                elif command == COMMAND_SYNC_NODES: # Only for sync calls
                    # Idle worker: contribute a dummy entry to the collective
                    # gather so the other ranks can make progress.
                    print_rank(f"COMMMAND_SYNC_NODES received {rank()}", loglevel=logging.DEBUG)
                    gather_objects = [None for i in range(size())]
                    output = [None for _ in gather_objects]
                    dist.all_gather_object(output, gather_objects[rank()])
                    print_rank(f"Worker IDLE {rank()} sent dummy output back", loglevel=logging.DEBUG)
                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None

                else:
                    assert False, "unknown command"

    def trigger_evaluate(self):
        """Single-GPU path: run validation for the round described in
        GLOBAL_MESSAGE and write the metrics back into GLOBAL_MESSAGE."""
        global GLOBAL_MESSAGE
        lr, model_params, nround = GLOBAL_MESSAGE
        server_data = (lr, model_params, int(nround))
        mode = "val"

        # Get client and dataset
        clients = self.val_clients if mode == "val" else self.test_clients
        dataset = self.val_dataset if mode == "val" else self.test_dataset
        clients_queue = clients.copy()
        client_to_process = clients_queue.pop()

        # Execute Client.get_data()
        client_data = client_to_process.get_client_data(dataset)

        # Execute Client.run_testvalidate()
        output = client_to_process.run_testvalidate(client_data, server_data, mode, self.model)
        _, metrics, num_instances = output
        metrics['num']= {'value': float(num_instances), 'higher_is_better': False}
        GLOBAL_MESSAGE = (_, metrics, num_instances)

        # Some cleanup
        torch.cuda.empty_cache()
        torch.cuda.synchronize() if torch.cuda.is_available() else None

    def trigger_train(self):
        """Single-GPU path: train the client id carried in GLOBAL_MESSAGE and
        write the training output back into GLOBAL_MESSAGE."""
        global GLOBAL_MESSAGE
        lr, model_params, nround, client_idx = GLOBAL_MESSAGE
        server_data = (lr, model_params, int(nround))

        # Instantiate client
        client_to_process = Client([client_idx],
            self.config,
            self.config['client_config']['type'] == 'optimization')

        # Execute Client.get_data()
        client_data = client_to_process.get_client_data()

        # Execute Client.process_round()
        GLOBAL_MESSAGE = client_to_process.process_round(client_data, server_data, self.model, self.data_path)

        # Some cleanup
        torch.cuda.empty_cache()
        torch.cuda.synchronize() if torch.cuda.is_available() else None


================================================
FILE: core/metrics.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
In this file we define the wrapper class for implementing metrics.
'''

import logging
import numpy as np
import torch

from utils import print_rank

class Metrics():

    def __init__(self):
        super().__init__()

    def compute_metrics(self,dataloader, model):
        '''This method is called by ´run_validation_generic´ function
        inside trainer.py .

        This is just a helper function that computes the metrics
        returned in the inference function inside ´model.py´.
        '''
        print_rank("Computing metrics")
        return self.call_inference(dataloader,model)

    def call_inference(self, dataloader, model):
        '''Run the model over ´dataloader´ without gradients and aggregate
        the per-batch metrics returned by ´model.inference´ into
        sample-weighted averages.

        Returns:
            tuple: (output_tot, metrics) where output_tot concatenates the
            per-batch probabilities/predictions/labels and metrics maps each
            metric name to {'value', 'higher_is_better'}.

        NOTE(review): an empty dataloader leaves ´inf_results´ undefined and
        ´counter´ at 0, so this raises (NameError / ZeroDivisionError); the
        final metric keys are taken from the LAST batch's inference dict --
        assumes every batch reports the same metric set. TODO confirm.
        '''

        metrics, sum_metrics = dict(), dict()
        output_tot = {"probabilities": [], "predictions": [], "labels":[]}
        counter = 0

        with torch.no_grad():
            for _, batch in enumerate(dataloader):
                val_loss = model.loss(batch).item()
                inf_results = model.inference(batch)
                # Loss is folded in as just another metric (lower is better).
                inf_results ['loss'] = {'value': val_loss,'higher_is_better': False}
                output = inf_results.pop('output')
                batch_size = inf_results.pop('batch_size')

                # Normalize bare values into {'value', 'higher_is_better'} dicts
                # and make sure each metric has an accumulator.
                for key in inf_results.keys():
                    if not isinstance(inf_results[key], dict):
                        inf_results[key] = {'value':inf_results[key],'higher_is_better': True}
                    sum_metrics[key] = [] if not key in sum_metrics else sum_metrics[key]

                if isinstance(output, dict):
                    output_tot["probabilities"].append(output["probabilities"])
                    output_tot["predictions"].append(output["predictions"])
                    output_tot["labels"].append(output["labels"])

                # Accumulate sample-weighted metric values for averaging below.
                for q in inf_results.keys():
                    sum_metrics[q].append(inf_results[q]['value']* batch_size)
                counter += batch_size
                torch.cuda.empty_cache()

        output_tot["probabilities"] = np.concatenate(output_tot["probabilities"]) if output_tot["probabilities"] else []
        output_tot["predictions"] = np.concatenate(output_tot["predictions"]) if output_tot["predictions"] else []
        output_tot["labels"] = np.concatenate(output_tot["labels"]) if output_tot["labels"] else []

        # Post-processing of metrics
        print_rank(f"validation complete {counter}", loglevel=logging.DEBUG)
        model.set_train()

        # Sample-weighted average over all evaluated examples.
        for k in inf_results.keys():
            metrics[k] = inf_results[k]
            metrics[k]['value'] = sum(sum_metrics[k])/counter

        print_rank(f"validation examples {counter}", loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        return output_tot, metrics

================================================
FILE: core/model.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch as T
from abc import ABC, abstractmethod

class BaseModel(ABC, T.nn.Module):
    '''This is a wrapper class for PyTorch models.'''

    @abstractmethod
    def __init__(self,**kwargs):
        super(BaseModel, self).__init__()

    @abstractmethod
    def loss(self, input):
        '''Performs forward step and computes the loss

        Returns:
            torch: Computed loss.
        '''
        pass

    @abstractmethod
    def inference(self, input):
        '''Performs forward step and computes metrics

        Returns:
            dict: The metrics to be computed. The following keys are
            the minimum required by FLUTE during evaluations rounds:

                - output
                - acc
                - batch_size

            More metrics can be computed by adding the key with a
            dictionary that includes the fields ´value´ and
            ´higher_is_better´ as follows:

            {'output':output, 'acc': accuracy, 'batch_size': n_samples,
            'f1_score': {'value':f1,'higher_is_better': True}}
        '''
        pass

    def set_eval(self):
        '''Bring the model into evaluation mode'''
        self.eval()

    def set_train(self):
        '''Bring the model into training mode'''
        self.train()

================================================
FILE: core/schema.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# ''' # In this file we define the schema for the configuration # files that will be pass it to an instance of the Validator # in e2e_trainer.py # ''' { 'model_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'model_type': {'required': True, 'type':'string'}, 'model_folder': {'required': True, 'type':'string'}, 'BERT':{ 'required':False, 'type': 'dict', 'allow_unknown': True, 'schema':{ 'loader_type': {'required': False, 'type': 'string'}, 'model': { 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'model_name_or_path': {'required': False, 'type':'string'}, 'model_name': {'required': True, 'type':'string'}, 'process_line_by_line': {'required': True, 'type':'boolean'}, } } } }, } }, 'dp_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'enable_local_dp': {'required': True, 'type':'boolean'}, 'enable_global_dp': {'required': False, 'type':'boolean'}, 'eps': {'required': False, 'type':'float'}, 'delta': {'required': False, 'type':'float'}, 'global_sigma': {'required': False, 'type':'float'}, 'max_grad': {'required': False, 'type':'float'}, 'max_weight': {'required': False, 'type':'float'}, 'weight_scaler': {'required': False, 'type':'float'}, 'min_weight': {'required': False, 'type':'float'}, } }, 'privacy_metrics_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'apply_metrics': {'required': True, 'type':'boolean'}, 'apply_indices_extraction': {'required': False, 'type':'boolean'}, 'allowed_word_rank': {'required': False, 'type':'integer'}, 'apply_leakage_metric': {'required': False, 'type':'boolean'}, 'max_leakage': {'required': False, 'type':'float'}, 'adaptive_leakage_threshold': {'required': False, 'type':'float'}, 'is_leakage_weighted': {'required': False, 'type':'boolean'}, 'attacker_optimizer_config': {'required': False, 'type':'dict', 'allow_unknown': True}, } }, 'strategy':{ 'required': True, 'type': 'string' }, 'server_config':{ 'required': True, 
'type': 'dict', 'allow_unknown': True, 'schema': { 'wantRL': {'required': True, 'type':'boolean', 'allow_unknown': True}, 'RL': {'required': False, 'type':'dict'}, 'resume_from_checkpoint': {'required': True, 'type':'boolean'}, 'do_profiling': {'required': True, 'type':'boolean'}, 'optimizer_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': True, 'type':'float'}, 'weight_decay': {'required': False, 'type':'float'}, } }, 'annealing_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string'}, 'step_interval': {'required': True, 'type':'string'}, 'gamma': {'required': True, 'type':'float'}, 'step_size': {'required': True, 'type':'integer'}, } }, 'val_freq': {'required': False, 'type':'integer', 'default': 1}, 'rec_freq': {'required': False, 'type':'integer', 'default': 8}, 'initial_val': {'required': False, 'type':'boolean', 'default': True}, 'initial_rec': {'required': False, 'type':'boolean', 'default': False}, 'max_iteration': {'required': False, 'type':'integer', 'default': 10000}, 'num_clients_per_iteration': {'required': False, 'type':'integer', 'default': 1}, 'data_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'keysrules':{'forbidden':['num_clients']}, 'schema': { 'val': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'val_data': {'required': True, 'type':'string', 'nullable':True}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': 
False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, 'cache_dir': {'required': False, 'type':'string'}, }, }, 'test': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'test_data': {'required': True, 'type':'string', 'nullable': True}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, 'cache_dir': {'required': False, 'type':'string'}, }, }, 'train': { 'required': False, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'train_data_server': {'required': False, 'type':'string'}, 'desired_max_samples': {'required': False, 'type':'integer'}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': 
False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, 'cache_dir': {'required': False, 'type':'string'}, } }, } }, 'type': { 'required': False, 'type':'string', 'allowed':['model_optimization', 'personalization'], 'default': 'model_optimization' }, 'aggregate_median': {'required': False, 'type':'string'}, 'initial_lr_client': {'required': True, 'type':'float'}, 'lr_decay_factor': {'required': True, 'type':'float'}, 'weight_train_loss': {'required': True, 'type':'string'}, 'best_model_criterion': {'required': False, 'type':'string', 'default':'loss'}, 'fall_back_to_best_model': {'required': False, 'type':'boolean', 'default': False}, 'softmax_beta': {'required': True, 'type':'float'}, 'server_replay_config': { 'required': False, 'type':'dict', 'schema':{ 'server_iterations': {'required': True, 'type':'integer'}, 'optimizer_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': True, 'type':'float'}, 'weight_decay': {'required': False, 'type':'float'}, 'amsgrad': {'required': False, 'type':'boolean'}, } }, } }, 'nbest_task_scheduler': { 'required': False, 'type':'dict', 'schema':{ 'num_tasks': {'required': True, 'type':'integer'}, 'iteration_per_task': {'required': True, 'type':'integer'}, } }, } }, 'client_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'meta_learning': {'required': False, 'type':'string'}, 'stats_on_smooth_grad': {'required': False, 'type':'boolean'}, 'ignore_subtask': {'required': True, 'type':'boolean'}, 'num_skips_threshold': {'required': False, 'type':'integer'}, 'copying_train_data': {'required': False, 'type':'boolean'}, 'do_profiling': {'required': True, 
'type':'boolean'}, 'data_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'keysrules':{'forbidden':['num_clients']}, 'schema': { 'train': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'list_of_train_data': {'required': True, 'type':'string', 'nullable': True}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, } }, } }, 'type': { 'required': False, 'type':'string', 'allowed':['optimization', 'gradient_computation'], 'default': 'gradient_computation', }, 'meta_optimizer_config': { 'required': False, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': True, 'type':'float'}, } }, 'optimizer_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': False, 'type':'float'}, 'weight_decay': {'required': False, 'type':'float'}, } }, 'annealing_config': { 'required': False, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string'}, 'step_interval': {'required': True, 'type':'string'}, 'gamma': {'required': False, 'type':'float'}, 'step_size': 
{'required': False, 'type':'integer'}, } }, 'ss_config': {'required': False, 'type':'dict', 'allow_unknown': True}, } }, } ================================================ FILE: core/server.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' In this file, we define the classes that live inside 'worker 0', the worker responsible for orchestration and aggregation. The main class is the OptimizationServer, which sends clients to the other workers to process and combines the resulting models. ''' import json import logging import os import random import shutil import time from collections import defaultdict import numpy as np import torch # Internal imports import core.federated as federated from core.evaluation import Evaluation from core.client import Client from .strategies import select_strategy from .trainer import ( ModelUpdater, Trainer, set_component_wise_lr, ) from utils import ( get_lr, print_rank, update_json_log, to_device, ) # For profiling import cProfile import pstats # AzureML-related libs from azureml.core import Run run = Run.get_context() class OptimizationServer(federated.Server): def __init__(self, num_clients, model, optimizer, ss_scheduler, data_path, model_path, server_train_dataloader, config, idx_val_clients, idx_test_clients, single_worker): '''Implement Server's orchestration and aggregation. This is the main Server class, that actually implements orchestration and aggregation, inheriting from `federated.Server`, which deals with communication only. The `train` method is central in FLUTE, as it defines good part of what happens during training. Args: num_clients (int): total available clients. model (torch.nn.Module): neural network model. optimizer (torch.optim.Optimizer): optimizer. ss_scheduler: scheduled sampling scheduler. data_path (str): points to where data is. model_path (str): points to where pretrained model is. 
        server_train_dataloader (torch.utils.data.DataLoader): dataloader for training
        config (dict): JSON style configuration parameters
        idx_val_clients (list): validation client ids
        idx_test_clients (list): testing clients ids
        '''

        super().__init__()

        # Initialize all attributes from arguments
        self.client_idx_list = list(range(num_clients))

        self.config = config
        server_config = config['server_config']
        decoder_config = config.get('decoder_config', None)

        self.max_iteration = server_config['max_iteration']
        self.do_clustering = server_config.get('clustering', False)

        # If enabled, full state dicts are shipped to clients instead of bare parameter tensors
        self.send_dicts = server_config.get('send_dicts', False)

        # Accepts either a single int or a comma-separated "min,max" string; the
        # two-element form triggers a random client count per round in `train`
        self.num_clients_per_iteration = [int(x) for x in server_config['num_clients_per_iteration'].split(',')] \
            if isinstance(server_config['num_clients_per_iteration'], str) \
            else [server_config['num_clients_per_iteration']]

        self.val_freq = server_config['val_freq']
        # NOTE(review): attribute is `req_freq` but the config key is 'rec_freq' --
        # naming mismatch carried over from the original; confirm before renaming.
        self.req_freq = server_config['rec_freq']

        self.evaluation = Evaluation(config, model_path, self.process_testvalidate, idx_val_clients, idx_test_clients, single_worker)

        # TODO: does this need to be adjusted for custom metrics?
        self.metrics = dict()

        self.model_backup_freq = server_config.get('model_backup_freq', 100)
        self.worker_trainer_config = server_config.get('trainer_config', {})

        self.aggregate_median = server_config['aggregate_median']
        self.initial_lr_client = server_config.get('initial_lr_client', -1.0)
        self.lr_decay_factor = server_config.get('lr_decay_factor', 1.0)

        self.model_type = config['model_config']['model_type']
        self.quant_thresh = config['client_config'].get('quant_thresh', None)
        self.quant_bits = config['client_config'].get('quant_bits', 10)

        self.list_of_train_data = config['client_config']['data_config']['train']['list_of_train_data']
        self.data_path = data_path
        self.single_worker = single_worker

        # Get max grad norm from data config
        if 'train' in server_config['data_config']:
            max_grad_norm = server_config['data_config']['train'].get('max_grad_norm', None)
        else:
            max_grad_norm = None

        # Creating an instance to update the model with stats aggregated from workers
        self.worker_trainer = ModelUpdater(
            model=model,
            optimizer=optimizer,
            ss_scheduler=ss_scheduler,
            train_dataloader=server_train_dataloader,
            val_dataloader=None,
            max_grad_norm=max_grad_norm,
            anneal_config=server_config['annealing_config'],
            model_type=self.model_type,
            decoder_config=decoder_config
        )
        self.metrics['worker_trainer'] = self.worker_trainer

        # Creating an instance for the server-side trainer (runs mini-batch SGD)
        self.server_replay_iterations = None
        self.server_trainer = None
        if server_train_dataloader is not None:
            assert 'server_replay_config' in server_config, 'server_replay_config is not set'
            assert 'optimizer_config' in server_config[
                'server_replay_config'], 'server-side replay training optimizer is not set'
            self.server_optimizer_config = server_config['server_replay_config']['optimizer_config']
            self.server_trainer_config = server_config['server_replay_config'].get('trainer_config', {})
            self.server_replay_iterations = server_config['server_replay_config']['server_iterations']
            self.server_trainer = Trainer(
                model=model,
                optimizer=None,
                ss_scheduler=ss_scheduler,
                train_dataloader=server_train_dataloader,
                server_replay_config=server_config['server_replay_config'],
                # max_grad_norm falls back from the replay config to the server data config
                max_grad_norm=server_config['server_replay_config']\
                    .get('max_grad_norm',server_config['data_config']['train']\
                        .get('max_grad_norm',None)),
                anneal_config=server_config['server_replay_config'].get('annealing_config', None),
                ignore_subtask = server_config['server_replay_config'].get('ignore_subtask', False)
            )

        self.skip_model_update = False  # will not update the model if True

        self.train_loss = 0.0
        self.model_path = model_path
        self.best_model_criterion = server_config['best_model_criterion']
        self.fall_back_to_best_model = server_config['fall_back_to_best_model']
        self.last_model_path = os.path.join(self.model_path, 'latest_model.tar')
        self.best_model_path = os.path.join(self.model_path,
            'best_val_{}_model.tar'.format(self.best_model_criterion))
        self.log_path = os.path.join(self.model_path, 'status_log.json')
        self.cur_iter_no = 0  # keep the iteration number for Tensor board plotting
        self.lr_weight = 1.0

        self.losses = []
        self.no_label_updates = 0  # no. label updates

        # Update the parameters above from the status log when resuming from a checkpoint
        if server_config.get('resume_from_checkpoint', False):
            self.load_saved_status()

        # Decoding config
        self.decoder_config = decoder_config
        self.spm_model = server_config['data_config']['test'].get('spm_model', None)

        self.do_profiling = server_config.get('do_profiling', False)

        StrategyClass = select_strategy(config['strategy'])
        self.strategy = StrategyClass('server', self.config, self.model_path)
        print_rank(f'Server successfully instantiated strategy {self.strategy}', loglevel=logging.DEBUG)

    def load_saved_status(self):
        '''Load checkpoint from disk.

        Restores both the model weights (onto `worker_trainer`) and the
        bookkeeping values (iteration number, best metrics, LR weight)
        from the status log, when either exists on disk.
        '''
        # Check if model is on disk, if so loads it onto trainer
        if os.path.exists(self.last_model_path):
            print_rank('Resuming from checkpoint model {}'.format(self.last_model_path))
            self.worker_trainer.load(self.last_model_path, update_lr_scheduler=True, update_ss_scheduler=True)
            if self.server_trainer is not None:
                self.server_trainer.model = self.worker_trainer.model  # make sure that the models are in sync

        # Check if log is on disk, if so loads it onto current stats
        if os.path.exists(self.log_path):
            with open(self.log_path, 'r') as logfp:  # loading the iteration no., best loss and CER
                elems = json.load(logfp)
                self.cur_iter_no = elems.get('i', 0)
                self.metrics['best_val_loss'] = elems.get('best_val_loss', float('inf'))
                self.metrics['best_val_acc'] = elems.get('best_val_acc', 0)
                self.metrics['best_test_loss'] = elems.get('best_test_loss', float('inf'))
                self.metrics['best_test_acc'] = elems.get('best_test_acc', 0)
                self.lr_weight = elems.get('weight', 1.0)
                self.no_label_updates = elems.get('num_label_updates', 0)
            print_rank(f'Resuming from status_log: cur_iter: {self.cur_iter_no}')
    def run(self):
        '''Trigger training.

        This is a simple wrapper to the `train` method.
        '''
        print_rank('server started')
        self.train()
        print_rank('server terminated')

    def train(self):
        '''Main method for training.

        Runs the federated loop: broadcast model to sampled clients, collect
        and aggregate their payloads via the strategy, optionally replay on the
        server, evaluate, checkpoint and log -- for `max_iteration` rounds.
        Workers are terminated in the `finally` clause even on failure.
        '''
        self.run_stats = {
            'secsPerClientRound': [],
            'secsPerClient': [],
            'secsPerClientTraining': [],
            'secsPerClientSetup': [],
            'secsPerClientFull': [],
            'secsPerRoundHousekeeping': [],
            'secsPerRoundTotal': [],
            'communicationCosts': []
        }

        run.log('Max iterations', self.max_iteration)

        try:
            self.worker_trainer.model = to_device(self.worker_trainer.model)

            # Do an initial validation round to understand the pretrained model's validation accuracy
            # Skip if we resumed from a checkpoint (cur_iter_no > 0)
            eval_list = []
            if self.cur_iter_no == 0:
                if self.config['server_config']['initial_rec']:
                    eval_list.append('test')
                if self.config['server_config']['initial_val']:
                    eval_list.append('val')
                run.log('LR for agg. opt.', get_lr(self.worker_trainer.optimizer))

            print_rank("Running {} at itr={}".format(eval_list, self.cur_iter_no))
            self.metrics = self.evaluation.run(eval_list, self.metrics, metric_logger=run.log)
            eval_list = []  # some cleanup

            # Dump all the information in aggregate_metric
            print_rank('Saving Model Before Starting Training', loglevel=logging.INFO)
            for token in ['best_val_loss', 'best_val_acc', 'best_test_acc', 'latest']:
                self.worker_trainer.save(
                    model_path=self.model_path,
                    token=token,
                    config=self.config['server_config']
                )

            # Training loop
            self.worker_trainer.model.train()
            for i in range(self.cur_iter_no, self.max_iteration):
                begin = time.time()
                metrics_payload = {}

                # Metrics are buffered per round and flushed to AML at the end of the iteration
                def log_metric(k, v):
                    metrics_payload[k] = v

                print_rank('==== iteration {}'.format(i))
                log_metric('Current iteration', i)

                # Initial value for the learning rate of the worker
                initial_lr = self.initial_lr_client * self.lr_weight
                print_rank('Client learning rate {}'.format(initial_lr))

                # Run training on clients
                self.worker_trainer.model.zero_grad()
                self.train_loss = []

                if self.send_dicts:
                    # Send state dictionaries (CPU copies, so they can be pickled to workers)
                    glob_payload = [self.worker_trainer.model.state_dict()[param_key].to(torch.device('cpu')) for param_key in self.worker_trainer.model.state_dict()]
                else:
                    # Send parameters
                    glob_payload = [p.data.to(torch.device('cpu')) for p in self.worker_trainer.model.parameters()]
                server_data = (initial_lr, glob_payload, i)

                # Random number of clients per iteration
                if len(self.num_clients_per_iteration) > 1:
                    num_clients_curr_iter = random.randint(
                        self.num_clients_per_iteration[0],
                        self.num_clients_per_iteration[1]
                    )
                else:
                    num_clients_curr_iter = self.num_clients_per_iteration[0]
                log_metric('Clients for round', num_clients_curr_iter)

                # Perform annealing in quantization threshold
                if self.quant_thresh is not None:
                    self.config['client_config']['quant_thresh'] *= self.config['client_config'].get('quant_anneal', 1.0)
                    self.quant_thresh = self.config['client_config']['quant_thresh']
                    log_metric('Quantization Thresh.', self.config['client_config']['quant_thresh'])

                # Create the pool of clients -- sample from this pool to assign to workers
                sampled_idx_clients = random.sample(self.client_idx_list,
                    num_clients_curr_iter) if num_clients_curr_iter > 0 else self.client_idx_list

                # Initialize stats
                clients_begin = time.time()

                client_losses = []
                client_mag_grads = []
                client_mean_grads = []
                client_var_grads = []
                client_norm_grads = []

                self.run_stats['secsPerClient'].append([])
                self.run_stats['secsPerClientFull'].append([])
                self.run_stats['secsPerClientTraining'].append([])
                self.run_stats['secsPerClientSetup'].append([])
                self.run_stats['communicationCosts'].append([])

                # Check if we want privacy metrics
                apply_privacy_metrics = self.config.get('privacy_metrics_config', None) and \
                    self.config['privacy_metrics_config']['apply_metrics']
                adaptive_leakage = apply_privacy_metrics and \
                    self.config['privacy_metrics_config'].get('adaptive_leakage_threshold', None)
                if apply_privacy_metrics:
                    privacy_metrics_stats = defaultdict(list)

                # Initialize profiler
                profiler = None
                if self.do_profiling:
                    profiler = cProfile.Profile()
                    profiler.enable()

                # Reset gradient for the model before assigning the new gradients
                self.worker_trainer.model.zero_grad()

                print_rank(f"Clients sampled from server {sampled_idx_clients}", loglevel=logging.DEBUG)
                for client_output in self.process_clients(sampled_idx_clients, server_data, self.single_worker):
                    # Process client output; keys are the compact wire format produced by the clients
                    client_timestamp = client_output['ts']
                    client_stats = client_output['cs']
                    client_loss = client_output['tl']
                    client_mag_grad = client_output['mg']
                    client_mean_grad = client_output['ng']
                    client_var_grad = client_output['vg']
                    client_norm_grad = client_output['rg']
                    client_payload = client_output['pl']

                    if apply_privacy_metrics:
                        privacy_stats = client_output['ps']
                        for metric, value in privacy_stats.items():
                            privacy_metrics_stats[metric].append(value)

                    self.run_stats['communicationCosts'][-1].append(time.time() - client_timestamp)

                    # Get actual pseudo-gradients for aggregation
                    payload_processed = self.strategy.process_individual_payload(self.worker_trainer, client_payload)
                    if not payload_processed:
                        print_rank('Dropping client', loglevel=logging.DEBUG)
                        num_clients_curr_iter -= 1
                        continue

                    # Aggregate stats
                    self.train_loss.append(client_loss)
                    client_losses.append(client_loss)
                    client_mag_grads.append(client_mag_grad.item())
                    client_mean_grads.append(client_mean_grad.item())
                    client_var_grads.append(client_var_grad.item())
                    client_norm_grads.append(client_norm_grad.item())

                    # Mark the end of client processing
                    client_end = time.time()

                    self.run_stats['secsPerClientFull'][-1].append(client_stats['full cost'])
                    self.run_stats['secsPerClientTraining'][-1].append(client_stats['training'])
                    self.run_stats['secsPerClientSetup'][-1].append(client_stats['setup'])
                    self.run_stats['secsPerClient'][-1].append(client_end - clients_begin)

                # Tear down profiler
                if self.do_profiling:
                    profiler.disable()
                    stats = pstats.Stats(profiler)
                    stats.sort_stats('cumulative').print_stats()

                # Prepare output
                client_mag_grads = np.array(client_mag_grads)
                client_mean_grads = np.array(client_mean_grads)
                client_var_grads = np.array(client_var_grads)
                client_norm_grads = np.array(client_norm_grads)

                client_stats = (client_mag_grads, client_mean_grads, client_var_grads)

                dump_norm_stats = self.config.get('dump_norm_stats', False)
                if dump_norm_stats:
                    with open(os.path.join(self.model_path, 'norm_stats.txt'), 'a', encoding='utf-8') as outF:
                        outF.write('{}\n'.format(json.dumps(list(client_norm_grads))))

                # Print the privacy metrics
                if apply_privacy_metrics:
                    for metric, values in privacy_metrics_stats.items():
                        if metric == 'Dropped clients':
                            log_metric(metric, sum(values))
                        else:
                            log_metric(metric, max(values))

                    if type(adaptive_leakage) is float:
                        values = privacy_metrics_stats['Practical epsilon (Max leakage)']
                        new_threshold = list(sorted(values))[int(adaptive_leakage*len(values))]
                        print_rank('Updating leakage threshold to {}'.format(new_threshold))
                        self.config['privacy_metrics_config']['max_allowed_leakage'] = new_threshold

                # Mark that all clients have been processed
                end = time.time()
                self.run_stats['secsPerClientRound'].append(end - begin)
                begin = end

                # Log the training loss to tensorboard/AML
                log_metric('Training loss', sum(self.train_loss))

                # Combine payloads
                self.losses = self.strategy.combine_payloads(
                    worker_trainer=self.worker_trainer,
                    curr_iter=i,
                    num_clients_curr_iter=num_clients_curr_iter,
                    total_clients = len(self.client_idx_list),
                    client_stats=client_stats,
                    logger=log_metric,
                )

                # Run a couple of iterations of training data on the server
                if self.server_trainer is not None:
                    print_rank('Running replay iterations on server')

                    if 'updatable_names' in self.server_trainer_config:
                        set_component_wise_lr(
                            self.worker_trainer.model,
                            self.server_optimizer_config,
                            self.server_trainer_config['updatable_names']
                        )
                    self.server_trainer.prepare_iteration(self.worker_trainer.model)
                    self.server_trainer.train_desired_samples(self.server_replay_iterations)
                    self.worker_trainer.model.load_state_dict(self.server_trainer.model.state_dict())
                    torch.cuda.empty_cache()

                # Update a sampling scheduler
                print_rank('Run ss scheduler')
                self.worker_trainer.run_ss_scheduler()

                # Run inference and score on val/test depending on the iter. number
                if ((i+1) % self.val_freq) == 0:
                    eval_list.append("val")
                if ((i+1) % self.req_freq) == 0 :
                    eval_list.append("test")

                if len(eval_list)> 0:
                    print_rank('Running {} at itr={}'.format(eval_list,i+1))
                    self.metrics['worker_trainer'] = self.worker_trainer
                    if hasattr(self.strategy,'tmp_unsup'):
                        self.metrics['tmp_sup'] = self.strategy.tmp_sup
                        self.metrics['tmp_unsup'] = self.strategy.tmp_unsup
                    self.metrics = self.evaluation.run(eval_list, self.metrics, metric_logger=run.log)
                    self.losses = self.evaluation.losses
                    eval_list = []

                # Create a schedule for the initial_lr (for the worker)
                # NOTE(review): `eval_list` was reset to [] just above, so this
                # condition can never be True as written and the LR-decay branch
                # looks unreachable -- confirm intended placement of the reset.
                if 'val' in eval_list:
                    run.log('LR for agg. opt.', get_lr(self.worker_trainer.optimizer))
                    if not (self.losses[0] < self.metrics['best_val_loss']):
                        self.lr_weight *= self.lr_decay_factor
                        print_rank('LOG: Client weight of learning rate {}..'.format(self.lr_weight))

                # Backup the current best models
                self.backup_models(i)

                # Fall back to the best model if the option is enabled
                self.fall_back_to_prev_best_status()

                # Logging the latest best values only after the 1st val/test round has been executed
                if len(self.metrics) > 1:
                    update_json_log(
                        self.log_path,
                        {
                            'i': i + 1,
                            'best_val_loss': float(self.metrics['best_val_loss']),
                            'best_val_acc': float(self.metrics['best_val_acc']),
                            'best_test_loss': float(self.metrics['best_test_loss']),
                            'best_test_acc': float(self.metrics['best_test_acc']),
                            'weight': float(self.lr_weight),
                            'num_label_updates': int(self.no_label_updates)
                        },
                    )

                end = time.time()

                # Aggregate stats
                self.run_stats['secsPerRoundHousekeeping'].append(end - begin)
                self.run_stats['secsPerRoundTotal'].append(self.run_stats['secsPerClientRound'][-1] + \
                    self.run_stats['secsPerRoundHousekeeping'][-1])

                log_metric('secsPerRoundTotal', self.run_stats['secsPerRoundTotal'][-1])
                if self.do_profiling:
                    log_metric('secsPerClientRound', self.run_stats['secsPerClientRound'][-1])
                    log_metric('secsPerRoundHousekeeping', self.run_stats['secsPerRoundHousekeeping'][-1])

                metrics_for_stats = [
                    'secsPerClient',
                    'secsPerClientTraining',
                    'secsPerClientFull',
                    'secsPerClientSetup',
                    'communicationCosts',
                ]

                for metric in metrics_for_stats:
                    log_metric(f'{metric}Mean', np.mean(self.run_stats[metric][-1]))
                    log_metric(f'{metric}Median', np.median(self.run_stats[metric][-1]))
                    log_metric(f'{metric}Max', max(self.run_stats[metric][-1]))

                for k in self.run_stats:
                    if k in metrics_for_stats:
                        print_rank('{}: {}'.format(k, max(self.run_stats[k][-1])), loglevel=logging.DEBUG)
                    else:
                        print_rank('{}: {}'.format(k, self.run_stats[k][-1]), loglevel=logging.DEBUG)

                # Log all the metrics
                for k in metrics_payload:
                    run.log(k, metrics_payload[k])

        finally:  # perform cleanup even if error was raised above
            self.terminate_workers(terminate=(not self.do_clustering))
''' # Always save the latest model self.worker_trainer.save( model_path=self.model_path, token='latest', config=self.config['server_config'], ) if (i % self.model_backup_freq) == 0: # save the current best models self.worker_trainer.save( model_path=self.model_path, token='epoch{}'.format(i), config=self.config['server_config'] ) for bodyname in ['best_val_acc', 'best_val_loss', 'best_test_acc']: src_model_path = os.path.join(self.model_path, '{}_model.tar'.format(bodyname)) if os.path.exists(src_model_path): dst_model_path = os.path.join(self.model_path, 'epoch{}_{}_model.tar'.format(i, bodyname)) shutil.copyfile(src_model_path, dst_model_path) print_rank('Saved {}'.format(dst_model_path)) def fall_back_to_prev_best_status(self): '''Go back to the past best status and switch to the recent best model.''' if self.fall_back_to_best_model: print_rank('falling back to model {}'.format(self.best_model_path)) # Save current learning rate tmp_lr = get_lr(self.worker_trainer.optimizer) # Load previous best model self.worker_trainer.load(self.best_model_path, update_lr_scheduler=False, update_ss_scheduler=False) # Update previous learning rate on optimizer for g in self.worker_trainer.optimizer.param_groups: g['lr'] = tmp_lr if self.server_trainer is not None: self.server_trainer.model = self.worker_trainer.model # make sure that the models are in sync def select_server(server_type): '''Select a server type using different possible strings. Right now this just returns `OptimizationServer`, but this function could be useful when there are multiple choices of server. Args: server_type (str): indicates server choice. config (dict): config parsed from YAML, passed so that parameters can be used to select a given server. 
''' if server_type == "personalization": from experiments.cv.server import PersonalizationServer return PersonalizationServer else: return OptimizationServer ================================================ FILE: core/strategies/__init__.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. from .base import BaseStrategy from .fedavg import FedAvg from .dga import DGA from .fedlabels import FedLabels def select_strategy(strategy): ''' Selects the aggregation strategy class NOTE: FedProx uses FedAvg weights during aggregation, which are proportional to the number of samples in each client. ''' if strategy.lower() == 'dga': return DGA elif strategy.lower() in ['fedavg', 'fedprox']: return FedAvg elif strategy.lower() == 'fedlabels': return FedLabels else: raise ValueError(f'cannot use strategy f{strategy}') ================================================ FILE: core/strategies/base.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. from abc import abstractmethod @abstractmethod class BaseStrategy: def __init__(self, mode, config, model_path=None): '''Federated learning strategy Args: mode (str): which part the instantiated object should play, typically either :code:`client` or :code:`server`. config (dict): initial config dict. model_path (str): where to find model, needed for debugging only. ''' pass def generate_client_payload(self, trainer): '''Generate client payload Args: trainer (core.Trainer object): trainer on client. Returns: dict containing payloads in some specified format. ''' pass def process_individual_payload(self, worker_trainer, payload): '''Process client payload Args: worker_trainer (core.Trainer object): trainer on server (aka model updater). payload (dict): whatever is generated by :code:`generate_client_payload`. Returns: True if processed succesfully, False otherwise. 
class DGA(BaseStrategy):
    '''Dynamic Gradient Aggregation'''

    def __init__(self, mode, config, model_path=None):
        ''' Dynamic Gradient Aggregation (DGA) strategy.

        For more info see arXiv:2106.07578.

        Args:
            mode (str): which part the instantiated object should play,
                typically either :code:`client` or :code:`server`.
            config (dict): initial config dict.
            model_path (str): where to find model, needed for debugging only.
        '''
        super().__init__(mode=mode, config=config, model_path=model_path)

        if mode not in ['client', 'server']:
            raise ValueError('mode in strategy must be either `client` or `server`')

        self.config = config
        self.model_path = model_path
        self.mode = mode

        # Parse config
        self.model_config = config['model_config']
        self.client_config = config['client_config']
        self.server_config = config['server_config']
        self.dp_config = config.get('dp_config', None)

        if mode == 'client':
            # Client-side knobs: gradient stats source and quantization settings
            self.stats_on_smooth_grad = self.client_config.get('stats_on_smooth_grad', False)
            self.quant_threshold = self.client_config.get('quant_thresh', None)
            self.quant_bits = self.client_config.get('quant_bits', 10)
        elif mode == 'server':
            # Server-side knobs: aggregation mode, RL weighting and staleness simulation
            self.dump_norm_stats = self.config.get('dump_norm_stats', False)
            self.aggregate_fast = self.server_config.get('fast_aggregation', False)
            self.want_rl = self.server_config.get('wantRL', False)
            self.stale_prob = self.server_config.get('stale_prob', 0.0)
            self.skip_model_update = False

            # Do some checks and create objects based on configs:
            # fast aggregation sums gradients as they arrive, so it is
            # incompatible with RL re-weighting and with stale gradients
            if self.aggregate_fast:
                print_rank('It is NOT possible to enable RL with fast_aggregation, RL is set to False', loglevel=logging.INFO)
                self.want_rl = False
                print_rank('It is NOT possible in Current Implementation to have stale gradients with fast_aggregation, stale_prob is set to 0.0', loglevel=logging.INFO)
                self.stale_prob = 0.0

            if self.want_rl:
                self.rl = RL(config=self.server_config)

            # Initialize accumulators used across process/combine calls within a round
            self.client_parameters_stack = []
            self.client_parameters_stack_stale = []
            self.client_weights = []
            self.weight_sum_stale = 0.0
''' if self.mode != 'client': raise RuntimeError('this method can only be invoked by the client') # Get weights for aggregation, potentially using DGA weight = 1.0 add_weight_noise = False # Reset gradient stats and recalculate them on the smooth/pseudo gradient if self.stats_on_smooth_grad: trainer.reset_gradient_power() trainer.estimate_sufficient_stats() # If we are using softmax based on training loss, it needs DP noise if self.config['server_config']['aggregate_median'] == 'softmax': # This matters when DP is required add_weight_noise = True if 'weight_train_loss' not in self.config['server_config'] or self.config['server_config']['weight_train_loss'] == 'train_loss': training_weight = trainer.train_loss / trainer.num_samples elif self.config['server_config']['weight_train_loss'] == 'mag_var_loss': training_weight = trainer.sufficient_stats['var'] elif self.config['server_config']['weight_train_loss'] == 'mag_mean_loss': training_weight = trainer.sufficient_stats['mean'] else: training_weight = trainer.sufficient_stats['mag'] try: weight = math.exp(-self.config['server_config']['softmax_beta'] * training_weight) except: print_rank('There is an issue with the weight -- Reverting to {}'.format(MIN_WEIGHT), loglevel=logging.DEBUG) weight = MIN_WEIGHT weight = filter_weight(weight) # Add local DP noise here. # When weight == 0, something went wrong. So we'll skip adding noise and return a zero gradient. 
if weight > 0.0 and self.dp_config is not None and self.dp_config.get('enable_local_dp', False): weight = privacy.apply_local_dp(trainer, weight, self.dp_config, add_weight_noise) # In all other cases we can compute the weight after adding noise if not add_weight_noise: assert self.config['server_config']['aggregate_median'] == 'mean' assert weight == 1.0 # Weight the gradient and remove gradients of the layers we want to freeze for n, p in trainer.model.named_parameters(): p.grad = weight * p.grad if self.model_config.get('freeze_layer', None) and n == self.model_config['freeze_layer']: print_rank('Setting gradient to zero for layer: {}'.format(n), loglevel=logging.INFO) p.grad.mul_(0) # Gradient quantization step -- if quant_threshold is None, the code returns without doing anything quant_model(trainer.model, quant_threshold=self.quant_threshold, quant_bits=self.quant_bits, global_stats=False) payload = {} payload['weight'] = weight payload['gradients'] = [p.grad.to(torch.device('cpu')) for p in trainer.model.parameters()] return payload def process_individual_payload(self, worker_trainer, payload): '''Process client payload Args: worker_trainer (core.Trainer object): trainer on server (aka model updater). payload (dict): whatever is generated by :code:`generate_client_payload`. Returns: True if processed succesfully, False otherwise. ''' if self.mode != 'server': raise RuntimeError('this method can only be invoked by the server') if payload['weight'] == 0.0: return False self.client_weights.append(payload['weight']) if self.aggregate_fast: aggregate_gradients_inplace(worker_trainer.model, payload['gradients']) else: self.client_parameters_stack.append(payload['gradients']) return True def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None): '''Combine payloads to update model Args: worker_trainer (core.Trainer object): trainer on server (aka model updater). curr_iter (int): current iteration. 
    def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None):
        '''Combine payloads to update model

        Sums the stacked client gradients (optionally re-weighted via RL),
        normalizes by the weight sum, applies global DP, then steps the
        server optimizer and LR scheduler.

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            curr_iter (int): current iteration.
            num_clients_curr_iter (int): number of clients on current iteration.
            total_clients (int): size of total pool of clients (for privacy accounting)
            client_stats (dict): stats being collected.
            logger (callback): function called to log quantities.

        Returns:
            losses, computed for use with LR scheduler.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        if self.want_rl:
            rl_model = self._run_rl_inference(self.client_weights, *client_stats)

        # Aggregation step; keep detached copies first if cosine stats were requested,
        # since _aggregate_gradients clears the stack
        if self.dump_norm_stats:
            cps_copy = [[g.clone().detach() for g in x] for x in self.client_parameters_stack]
        weight_sum = self._aggregate_gradients(worker_trainer, num_clients_curr_iter, self.client_weights, metric_logger=logger)
        print_rank('Sum of weights: {}'.format(weight_sum), loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        # Normalize with weight_sum
        for p in worker_trainer.model.parameters():
            p.grad /= weight_sum

        if self.dump_norm_stats:
            cosines = compute_grad_cosines(cps_copy, [p.grad.clone().detach() for p in worker_trainer.model.parameters()])
            with open(os.path.join(self.model_path, 'cosines.txt'), 'a', encoding='utf-8') as outfile:
                outfile.write('{}\n'.format(json.dumps(cosines)))

        # DP-specific steps
        privacy.apply_global_dp(self.config, worker_trainer.model, num_clients_curr_iter=num_clients_curr_iter, select_grad=True, metric_logger=logger)
        eps = privacy.update_privacy_accountant(self.config, total_clients, curr_iter=curr_iter, num_clients_curr_iter=num_clients_curr_iter)
        if eps:
            print_rank(f'DP result: {eps}')

        # NOTE: returns None in this case, callers must tolerate a missing losses value
        if self.skip_model_update is True:
            print_rank('Skipping model update')
            return

        # Run optimization with gradient/model aggregated from clients
        print_rank('Updating model')
        worker_trainer.update_model()
        print_rank('Updating learning rate scheduler')
        losses = worker_trainer.run_lr_scheduler(force_run_val=False)

        if self.want_rl:
            self._run_rl_training(curr_iter, rl_model, self.client_weights, *client_stats, logger)

        return losses
worker_trainer, num_clients_curr_iter, client_weights, metric_logger=None): '''Go through stored gradients, aggregate and put them inside model. Args: num_clients_curr_iter (int): how many clients were processed. client_weights: weight for each client. metric_logger (callback, optional): callback used for logging. Defaults to None, in which case AML logger is used. Returns: float: sum of weights for all clients. ''' weight_sum = 0 if metric_logger is None: metric_logger = run.log if not self.aggregate_fast: metric_logger('Stale Gradients Ratio', len(self.client_parameters_stack_stale) / num_clients_curr_iter) if len(self.client_parameters_stack_stale) > 0: weight_sum = self.weight_sum_stale for client_parameters in self.client_parameters_stack_stale: # Model parameters are already multiplied with weight on client, we only have to sum them up aggregate_gradients_inplace(worker_trainer.model, client_parameters) self.client_parameters_stack_stale = [] self.weight_sum_stale = 0 for client_weight, client_parameters in zip(client_weights, self.client_parameters_stack): if np.random.random() > self.stale_prob: # Model parameters are already multiplied with weight on client, we only have to sum them up aggregate_gradients_inplace(worker_trainer.model, client_parameters) else: self.weight_sum_stale += client_weight self.client_parameters_stack_stale.append(client_parameters) weight_sum += sum(client_weights) - self.weight_sum_stale # Some cleaning self.client_parameters_stack = [] self.client_weights = [] return weight_sum def _run_rl_inference(self, client_weights, client_mag_grads, client_mean_grads, client_var_grads): '''Uses RL to estimate weights, using DGA. Args: client_weights (numpy.ndarray): original weights for aggregation. client_mag_grads (numpy.ndarray): gradient stats for RL (magnitudes). client_mean_grads (numpy.ndarray): gradient stats for RL (means). client_var_grads (numpy.ndarray): gradient stats for RL (vars). 
Returns: list of torch.Tensor: parameters of model used to perform RL. ''' weight_sum = 0 original_model = copy.copy([p for p in self.worker_trainer.model.parameters()]) # Reinforcement learning for estimating weights print_rank('RL estimation of the aggregation weights', loglevel=logging.INFO) rl_weights = self.rl.forward( np.concatenate((client_weights, client_mag_grads, client_mean_grads, client_var_grads), axis=0)).cpu().detach().np() if rl_weights.ndim > 1: rl_weights = rl_weights[-1, :] rl_weights = np.exp(rl_weights) print_rank('RL Weights BEFORE filtering: {}'.format(rl_weights), loglevel=logging.DEBUG) index = np.argwhere(np.isnan(rl_weights)) rl_weights[index] = 0 index = np.argwhere(np.isinf(rl_weights)) rl_weights[index] = 0 print_rank('RL Weights AFTER filtering: {}'.format(rl_weights), loglevel=logging.DEBUG) for client_parameters, orig_weight, rl_weight in zip(self.client_parameters_stack, client_weights, rl_weights): # Model parameters are already multiplied with weight on client, we only have to sum them up for p, client_grad in zip(self.worker_trainer.model.parameters(), client_parameters): if p.grad is None: p.grad = to_device(client_grad) * rl_weight / orig_weight else: p.grad += to_device(client_grad) * rl_weight / orig_weight weight_sum += rl_weight # Normalize with weight_sum for p in self.worker_trainer.model.parameters(): p.grad /= weight_sum # Run optimization with gradient/model aggregated from clients self.worker_trainer.update_model() # Get the validation result back (rl_val_loss, rl_val_acc) = self.worker_trainer.run_lr_scheduler(force_run_val=True) # Save model and revert to previous one rl_model = copy.copy([p.data for p in self.worker_trainer.model.parameters()]) for p, p_ in zip(self.worker_trainer.model.parameters(), original_model): p.data = p_.data.detach().clone() # Set the current set of weights self.rl.set_weights(rl_weights) self.rl.set_losses((rl_val_loss, rl_val_acc)) # Return the resulting RL-based model return rl_model 
def _run_rl_training(self, iter, rl_model, client_weights, client_mag_grads, client_mean_grads, client_var_grads, metric_logger): '''Trains RL for estimating weights, following DGA recipe. Args: iter (int): current iteration. rl_model (list of torch.Tensor): parameters of model used to perform RL. client_weights (numpy.ndarray): original weights for aggregation. client_mag_grads (numpy.ndarray): gradient stats for RL (magnitudes). client_mean_grads (numpy.ndarray): gradient stats for RL (means). client_var_grads (numpy.ndarray): gradient stats for RL (vars). metric_logger (callback, optional): callback used for logging. Defaults to None, in which case AML logger is used. ''' # Get the validation result back if None in self.losses: self.losses = self.run_distributed_inference(mode='val') # Expected structure of batch print_rank('Performing RL training on the aggregation weights') if abs(self.losses[1] - self.rl.rl_losses[1]) < 0.001: reward = 0.1 print_rank( 'Iter:{} val_ACC={} rl_val_ACC={} reward={}'.format(iter, self.losses[1], self.rl.rl_losses[1], reward)) if 'marginal_update_RL' in self.config['server_config'] and \ self.config['server_config']['marginal_update_RL']: self.losses = self.rl.rl_losses for p, p_ in zip(self.worker_trainer.model.parameters(), rl_model): p.data= p_.data.detach().clone() elif (self.losses[1] - self.rl.rl_losses[1]) > 0: reward = 1.0 print_rank( 'Iter:{} val_ACC={} rl_val_ACC={} reward={}'.format(iter, self.losses[1], self.rl.rl_losses[1], reward)) self.losses = self.rl.rl_losses for p, p_ in zip(self.worker_trainer.model.parameters(), rl_model): p.data = p_.data.detach().clone() else: reward = -1.0 print_rank( 'Iter:{} val_ACC={} rl_val_ACC={} reward={}'.format(iter, self.losses[1], self.rl.rl_losses[1], reward)) # Taking the policy from a game-based RL batch = ( (np.concatenate((client_weights, client_mag_grads, client_mean_grads, client_var_grads), axis=0)), (self.rl.rl_weights), [reward] ) print_rank('RL Model Update -- Training') 
self.rl.train(batch) print_rank('RL State Saving') self.rl.save(iter) print_rank('RL logging') metric_logger('RL Running Loss', self.rl.runningLoss) metric_logger('RL Rewards', reward) ================================================ FILE: core/strategies/fedavg.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import json import logging import os import torch from utils import compute_grad_cosines, print_rank from core.strategies import BaseStrategy from core.strategies.utils import ( aggregate_gradients_inplace, ) from azureml.core import Run run = Run.get_context() class FedAvg(BaseStrategy): '''Federated Averaging''' def __init__(self, mode, config, model_path=None): '''Federated Averaging strategy. Args: mode (str): which part the instantiated object should play, typically either :code:`client` or :code:`server`. config (dict): initial config dict. model_path (str): where to find model, needed for debugging only. ''' super().__init__(mode=mode, config=config, model_path=model_path) if mode not in ['client', 'server']: raise ValueError('mode in strategy must be either `client` or `server`') self.config = config self.model_path = model_path self.mode = mode # Parse config self.model_config = config['model_config'] self.client_config = config['client_config'] self.server_config = config['server_config'] self.dp_config = config.get('dp_config', None) if mode == 'client': self.stats_on_smooth_grad = self.client_config.get('stats_on_smooth_grad', False) elif mode == 'server': self.dump_norm_stats = self.config.get('dump_norm_stats', False) self.aggregate_fast = self.server_config.get('fast_aggregation', False) self.skip_model_update = False # Initialize accumulators self.client_parameters_stack = [] self.client_weights = [] def generate_client_payload(self, trainer): '''Generate client payload Args: trainer (core.Trainer object): trainer on client. 
        Returns:
            dict containing payloads in some specified format.
        '''
        if self.mode != 'client':
            raise RuntimeError('this method can only be invoked by the client')

        # Reset gradient stats and recalculate them on the smooth/pseudo gradient
        if self.stats_on_smooth_grad:
            trainer.reset_gradient_power()
            trainer.estimate_sufficient_stats()

        # Weight the gradient and remove gradients of the layers we want to freeze;
        # the weight is the client's sample count (classic FedAvg weighting)
        weight = trainer.num_samples
        for n, p in trainer.model.named_parameters():
            p.grad = weight * p.grad
            if self.model_config.get('freeze_layer', None) and n == self.model_config['freeze_layer']:
                print_rank('Setting gradient to zero for layer: {}'.format(n), loglevel=logging.INFO)
                p.grad.mul_(0)

        payload = {}
        payload['weight'] = weight
        # Move gradients to CPU so the payload can be serialized/shipped
        payload['gradients'] = [p.grad.to(torch.device('cpu')) for p in trainer.model.parameters()]

        return payload

    def process_individual_payload(self, worker_trainer, payload):
        '''Process client payload

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            payload (dict): whatever is generated by :code:`generate_client_payload`.

        Returns:
            True if processed successfully, False otherwise.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # A zero weight carries no information; drop the payload
        if payload['weight'] == 0.0:
            return False

        self.client_weights.append(payload['weight'])
        if self.aggregate_fast:
            # Sum gradients into the server model right away instead of stacking
            aggregate_gradients_inplace(worker_trainer.model, payload['gradients'])
        else:
            self.client_parameters_stack.append(payload['gradients'])

        return True

    def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None):
        '''Combine payloads to update model

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            curr_iter (int): current iteration.
            num_clients_curr_iter (int): number of clients on current iteration.
            client_stats (dict): stats being collected.
            logger (callback): function called to log quantities.

        Returns:
            losses, computed for use with LR scheduler.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # Aggregation step
        if self.dump_norm_stats:
            # Keep a detached copy of per-client gradients for cosine stats below
            cps_copy = [[g.clone().detach() for g in x] for x in self.client_parameters_stack]
        weight_sum = self._aggregate_gradients(worker_trainer, num_clients_curr_iter, self.client_weights, metric_logger=logger)
        print_rank('Sum of weights: {}'.format(weight_sum), loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        # Normalize with weight_sum (turns the weighted sum into a weighted average)
        for p in worker_trainer.model.parameters():
            p.grad /= weight_sum

        if self.dump_norm_stats:
            cosines = compute_grad_cosines(cps_copy, [p.grad.clone().detach() for p in worker_trainer.model.parameters()])
            with open(os.path.join(self.model_path, 'cosines.txt'), 'a', encoding='utf-8') as outfile:
                outfile.write('{}\n'.format(json.dumps(cosines)))

        if self.skip_model_update is True:
            print_rank('Skipping model update')
            return

        # Run optimization with gradient/model aggregated from clients
        print_rank('Updating model')
        worker_trainer.update_model()
        print_rank('Updating learning rate scheduler')
        losses = worker_trainer.run_lr_scheduler(force_run_val=False)

        # TODO: Global DP. See dga.py

        return losses

    def _aggregate_gradients(self, worker_trainer, num_clients_curr_iter, client_weights, metric_logger=None):
        '''Go through stored gradients, aggregate and put them inside model.

        Args:
            num_clients_curr_iter (int): how many clients were processed.
            client_weights: weight for each client.
            metric_logger (callback, optional): callback used for logging.
                Defaults to None, in which case AML logger is used.

        Returns:
            float: sum of weights for all clients.
        '''
        if metric_logger is None:
            metric_logger = run.log

        if not self.aggregate_fast:
            # In fast mode the sums already happened in process_individual_payload
            for client_parameters in self.client_parameters_stack:
                # Model parameters are already multiplied with weight on client, we only have to sum them up
                aggregate_gradients_inplace(worker_trainer.model, client_parameters)
        weight_sum = sum(client_weights)

        # Some cleaning
        self.client_parameters_stack = []
        self.client_weights = []

        return weight_sum


================================================
FILE: core/strategies/fedlabels.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging
import os

import torch
import numpy as np

from azureml.core import Run
from core.strategies import BaseStrategy
from utils import (
    compute_grad_cosines,
    print_rank,
    to_device)

run = Run.get_context()


class FedLabels(BaseStrategy):
    '''FedLabels: Semi-supervision strategy.'''

    def __init__(self, mode, config, model_path=None):
        '''
        Args:
            mode (str): which part the instantiated object should play,
                typically either :code:`client` or :code:`server`.
            config (dict): initial config dict.
            model_path (str): where to find model, needed for debugging only.
        '''
        super().__init__(mode=mode, config=config, model_path=model_path)

        if mode not in ['client', 'server']:
            raise ValueError('mode in strategy must be either `client` or `server`')

        self.config = config
        self.model_path = model_path
        self.mode = mode

        # Parse config
        self.model_config = config['model_config']
        self.client_config = config['client_config']
        self.server_config = config['server_config']
        self.dp_config = config.get('dp_config', None)

        # Aggregated state dicts of the supervised / unsupervised models,
        # filled in by combine_payloads on the server
        self.tmp_sup = None
        self.tmp_unsup = None

        if mode == 'client':
            self.stats_on_smooth_grad = self.client_config.get('stats_on_smooth_grad', False)
        elif mode == 'server':
            self.dump_norm_stats = self.config.get('dump_norm_stats', False)
            self.aggregate_fast = self.server_config.get('fast_aggregation', False)
            self.skip_model_update = False

            # Initialize accumulators
            self.client_parameters_stack = []
            self.client_weights = []

    def generate_client_payload(self, trainer):
        '''Generate client payload

        Args:
            trainer (core.Trainer object): trainer on client.
            unsup_dict (dict): unsupervised model state dictionary
            iteration (int): training round
            total_est_labels (int): labels generated

        Returns:
            dict containing payloads in some specified format.
        '''
        # The unsupervised model state dict is produced during client training
        unsup_dict = trainer.algo_computation

        if self.mode != 'client':
            raise RuntimeError('this method can only be invoked by the client')

        # Reset gradient stats and recalculate them on the smooth/pseudo gradient
        if self.stats_on_smooth_grad:
            trainer.reset_gradient_power()
            trainer.estimate_sufficient_stats()

        # Weight the gradient and preprocess state dictionaries from supervised and unsupervised model
        weight = 1 if trainer.num_samples == 0 else trainer.num_samples
        unsup_grads = [unsup_dict[param_tensor].to(torch.device('cpu')) for param_tensor in unsup_dict.keys()]
        sup_grads = [trainer.model.state_dict()[param_tensor].to(torch.device('cpu')) for param_tensor in trainer.model.state_dict().keys()]

        payload = {}
        payload['weight'] = weight
        # Supervised tensors first, unsupervised second; the server splits this
        # list in half again in _aggregate_gradients
        payload['gradients'] = sup_grads + unsup_grads

        return payload

    def process_individual_payload(self, worker_trainer, payload):
        '''Process client payload

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            payload (dict): whatever is generated by :code:`generate_client_payload`.

        Returns:
            True if processed successfully, False otherwise.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # A zero weight carries no information; drop the payload
        if payload['weight'] == 0.0:
            return False

        self.client_weights.append(payload['weight'])
        if self.aggregate_fast:
            # NOTE(review): the only aggregate_gradients_inplace in scope here
            # is the module-level function below, which takes
            # (keys, values, first, tmp, ratio) -- calling it with two
            # arguments would raise TypeError, so the fast_aggregation path
            # appears broken for FedLabels; confirm before enabling it.
            aggregate_gradients_inplace(worker_trainer.model, payload['gradients'])
        else:
            self.client_parameters_stack.append(payload['gradients'])

        return True

    def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None):
        '''Combine payloads to update model

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            curr_iter (int): current iteration.
            num_clients_curr_iter (int): number of clients on current iteration.
            client_stats (dict): stats being collected.
            logger (callback): function called to log quantities.

        Returns:
            losses, computed for use with LR scheduler.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # Aggregation step
        if self.dump_norm_stats:
            # Keep a detached copy of per-client payloads for cosine stats below
            cps_copy = [[g.clone().detach() for g in x] for x in self.client_parameters_stack]
        weight_sum, self.tmp_sup, self.tmp_unsup = self._aggregate_gradients(worker_trainer, num_clients_curr_iter, self.client_weights, metric_logger=logger)
        print_rank('Sum of weights: {}'.format(weight_sum), loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        # Disjoint aggregation: equal-weight blend of the supervised and
        # unsupervised aggregated state dicts, loaded directly into the model
        tmp_both = {}
        for param_key in self.tmp_unsup.keys():
            tmp_both[param_key] = self.tmp_sup[param_key]/2 + self.tmp_unsup[param_key]/2
        worker_trainer.model.load_state_dict(tmp_both)

        if self.dump_norm_stats:
            cosines = compute_grad_cosines(cps_copy, [p.grad.clone().detach() for p in worker_trainer.model.parameters()])
            with open(os.path.join(self.model_path, 'cosines.txt'), 'a', encoding='utf-8') as outfile:
                outfile.write('{}\n'.format(json.dumps(cosines)))

        if self.skip_model_update is True:
            print_rank('Skipping model update')
            return

        # Run optimization with gradient/model aggregated from clients
        print_rank('Updating model')
        worker_trainer.update_model()
        print_rank('Updating learning rate scheduler')
        losses = worker_trainer.run_lr_scheduler(force_run_val=False)

        # TODO: Global DP. See dga.py

        return losses

    def _aggregate_gradients(self, worker_trainer, num_clients_curr_iter, client_weights, metric_logger=None):
        '''Go through stored gradients, aggregate and put them inside model.

        Args:
            num_clients_curr_iter (int): how many clients were processed.
            client_weights: weight for each client.
            metric_logger (callback, optional): callback used for logging.
                Defaults to None, in which case AML logger is used.

        Returns:
            float: sum of weights for all clients.
            dict: supervised model state dictionary.
            dict: unsupervised model state dictionary.
        '''
        if metric_logger is None:
            metric_logger = run.log

        # Separate sup/unsup dictionaries from client payload: the first half
        # of each payload is the supervised model, the second the unsupervised
        sup_slice = int(len(self.client_parameters_stack[0])/2)
        keys = [key for key in worker_trainer.model.state_dict()]
        model_dicts = [client_dict[:sup_slice] for client_dict in self.client_parameters_stack]
        unsup_dicts = [client_dict[sup_slice:] for client_dict in self.client_parameters_stack]
        first = True
        tmp_sup, tmp_unsup = {}, {}

        # Compute ratios for each model: uniform for supervised, sample-count
        # weighted for unsupervised
        weight_sum = sum(client_weights)
        ratio_sup = 1/len(client_weights)
        ratio_unsup = np.array(client_weights)/weight_sum

        if not self.aggregate_fast:
            # Perform aggregation for supervised model
            for i, client_parameters in enumerate(model_dicts):
                first, tmp_sup = aggregate_gradients_inplace(keys, client_parameters, first, tmp_sup, ratio_sup)
            first = True
            # Perform aggregation for unsupervised model
            for j, client_parameters in enumerate(unsup_dicts):
                first, tmp_unsup = aggregate_gradients_inplace(keys, client_parameters, first, tmp_unsup, ratio_unsup[j])

        # Some cleaning
        self.client_parameters_stack = []
        self.client_weights = []

        return weight_sum, tmp_sup, tmp_unsup


def aggregate_gradients_inplace(keys, values, first, tmp, ratio):
    '''Aggregate list of tensors into model dictionary.

    Args:
        keys (list): state dictionary keys of model to which dictionaries will be summed.
        values (list): list of values to sum to model dictionary.
        first (bool): flag that indicates the first value in the dictionary.
        tmp (dict): model state dictionary that will be summed.
        ratio (float): ratio to weight each client value.
    '''
    for param_key, client_dict in zip(keys, values):
        if first:
            tmp[param_key] = to_device(client_dict) * ratio
        else:
            tmp[param_key] += to_device(client_dict) * ratio
    return False, tmp


================================================
FILE: core/strategies/utils.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging import numpy as np from utils import print_rank, to_device def filter_weight(weight): '''Handles aggregation weights if something messed them up''' print_rank('Client Weight BEFORE filtering: {}'.format(weight), loglevel=logging.DEBUG) if np.isnan(weight) or not np.isfinite(weight): weight = 0.0 elif weight > 100: weight = 100 print_rank('Client Weights AFTER filtering: {}'.format(weight), loglevel=logging.DEBUG) return weight def aggregate_gradients_inplace(model, gradients): '''Aggregate list of tensors into model gradients. Args: model (torch.nn.Module): model to which gradients will be summed. gradients (list): list of gradients to sum to model. ''' for p, client_grad in zip(model.parameters(), gradients): if p.grad is None: p.grad = to_device(client_grad) else: p.grad += to_device(client_grad) ================================================ FILE: core/trainer.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging import os import re import copy import random import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import DataLoader from core.metrics import Metrics from utils import \ get_lr, \ get_lr_all, \ make_optimizer, \ make_lr_scheduler, \ print_rank, \ torch_save, \ try_except_save, \ write_yaml from utils.utils import ( to_device, get_label_VAT) class TrainerBase: """Abstract class defining Trainer objects' common interface. Args: model (torch.nn.Module): model to be trained. train_dataloader (torch.utils.data.DataLoader): dataloader that provides the training data. optimizer: (torch.optim.Optimizer): optimizer that will be used to update the model. max_grad_norm (float): if not None, avg gradients are clipped to this norm; defaults to None. ignore_subtask (bool): ignore subtasks, defaults to True. model_type (str): what kind of model is used, defaults to :code:`LanguageModel`. 
        decoder_config (dict or None): config for decoder, defaults to None.
    """
    def __init__(
            self,
            model,
            train_dataloader,
            optimizer,
            max_grad_norm=None,
            ignore_subtask=True,
            model_type="LanguageModel",
            decoder_config=None
    ):
        self.model = model
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.max_grad_norm = max_grad_norm
        self.model_type = model_type
        self.decoder_config = decoder_config

        self.step = 0  # count how many batches are processed
        self.ignore_subtask = ignore_subtask  # ignore subtasks even if there are multiple task branches

    def epoch_boundary(self):
        '''Check if we are at the end of any given epoch.'''
        return self.step % len(self.train_dataloader.create_loader()) == 0 and self.step != 0

    def train_desired_samples(self, desired_max_samples, apply_privacy_metrics):
        # Abstract hook: subclasses implement the actual training step
        pass

    def save(self):
        # Abstract hook: subclasses implement checkpoint saving
        pass

    def load(self):
        # Abstract hook: subclasses implement checkpoint restore
        pass


class ModelUpdater(TrainerBase):
    """Update the model, given the already computed gradient.

    This is a special kind of trainer, that actually does not use any data.

    Args:
        model (torch.nn.Module): model to be updated.
        optimizer (torch.optim.Optimizer): optimizer that will be used to update the model.
        ss_scheduler: scheduled sampler.
        train_dataloader: train dataloader, this is not actually used.
        val_dataloader: val dataloader, this is not actually used.
        max_grad_norm (float): avg gradients are clipped to this norm.
        anneal_config (dict): annealing configuration.
        model_type (str): what kind of model is used, defaults to :code:`LanguageModel`.
        decoder_config (dict): config for decoder, defaults to None.
    """
    def __init__(
            self,
            model,
            optimizer,
            ss_scheduler,
            train_dataloader,
            val_dataloader,
            max_grad_norm,
            anneal_config,
            model_type="LanguageModel",
            decoder_config=None
    ):
        super().__init__(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=optimizer,
            max_grad_norm=max_grad_norm,
            model_type=model_type,
            decoder_config=decoder_config
        )

        self.val_dataloader = val_dataloader
        self.annealing_type = anneal_config["type"] if anneal_config is not None else None
        self.lr_scheduler = make_lr_scheduler(anneal_config, self.optimizer)
        self.ss_scheduler = ss_scheduler

    def update_model(self):
        """Update model parameters using pre-computed gradients."""

        # Apply gradient clipping
        if self.max_grad_norm is not None:
            grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            print_rank(f"clipped norm: {grad_norm} to {min(grad_norm,self.max_grad_norm)}", logging.DEBUG)

        # Do optimizer step
        self.optimizer.step()
        self.optimizer.zero_grad()

    def run_lr_scheduler(self, force_run_val=False):
        """Update learning rate using scheduler."""

        val_loss = val_acc = None
        # Validation is needed either when explicitly forced or when the
        # scheduler anneals on validation loss
        if force_run_val is True or self.annealing_type == "val_loss":
            _, val_loss, val_acc = run_validation_generic(self.model, self.val_dataloader)

        # Do LR scheduling
        print_rank(f"LR all: {list(get_lr_all(self.optimizer))}", loglevel=logging.DEBUG)
        print_rank("LR BEFORE lr_scheduler step: {}".format(get_lr(self.optimizer)))
        if self.annealing_type == "val_loss":
            self.lr_scheduler.step(val_loss)
        else:
            self.lr_scheduler.step()
        print_rank("LR AFTER lr_scheduler step: {}".format(get_lr(self.optimizer)), loglevel=logging.DEBUG)

        return (val_loss, val_acc)

    def run_ss_scheduler(self):
        """Do scheduled sampling."""

        if self.ss_scheduler is not None:
            self.ss_scheduler.step()

    def save(self, model_path, token=None, config=None):
        """Save model to disk."""

        save_model(
            model_path=model_path,
            config=config,
            model=self.model,
            optimizer=self.optimizer,
            lr_scheduler=self.lr_scheduler,
            ss_scheduler=self.ss_scheduler,
            token=token
        )

    def load(self, save_path, update_lr_scheduler, update_ss_scheduler):
        """Load model from disk.

        If save_path is given, load from there. If not, then resume training
        from current model dir. If at any point the save_path is not present on
        the disk, it won't be loaded.
        """
        if os.path.isfile(save_path):
            print_rank("Loading checkpoint: {}".format(save_path))
            checkpoint = torch.load(save_path)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            if self.optimizer is not None:
                self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

            anl_st_dict = checkpoint.get("lr_scheduler_state_dict")
            if anl_st_dict and self.lr_scheduler is not None and update_lr_scheduler is True:
                self.lr_scheduler.load_state_dict(anl_st_dict)

            sss_st_dict = checkpoint.get("ss_scheduler_state_dict")
            # NOTE(review): this condition checks update_lr_scheduler rather
            # than update_ss_scheduler, and the update_ss_scheduler parameter
            # is otherwise unused -- likely a bug; confirm intended behavior.
            if sss_st_dict and self.ss_scheduler is not None and update_lr_scheduler is True:
                self.ss_scheduler.load_state_dict(sss_st_dict)


class Trainer(TrainerBase):
    """Perform training step for any given client.

    The main method to be called for triggering a training step is
    :code:`train_desired_samples`, which on its turn relies on
    :code:`run_train_epoch`.

    Args:
        model (torch.nn.Module): model to be trained.
        ss_scheduler: scheduled sampler.
        train_dataloader (torch.data.utils.DataLoader): dataloader that
            provides the training data.
        server_replay_config (dict or None): config for replaying training;
            defaults to None, in which case no replaying happens.
        optimizer (torch.optim.Optimizer or None): optimizer that will be used
            to update the model. If :code:`None`, skip optimization.
        max_grad_norm (float or None): if not None, avg gradients are clipped
            to this norm; defaults to None.
        anneal_config (dict or None): annealing configuration.
        num_skips_threshold (int): previously used to skip users, deprecated.
        ignore_subtask (bool): ignore subtasks, defaults to True.
""" def __init__( self, model, ss_scheduler, train_dataloader, server_replay_config=None, optimizer=None, max_grad_norm=None, anneal_config=None, num_skips_threshold=-1, ignore_subtask=True ): super().__init__( model=model, train_dataloader=train_dataloader, optimizer=optimizer, max_grad_norm=max_grad_norm, ignore_subtask=ignore_subtask ) self.server_replay_config=None if server_replay_config is not None: self.server_replay_config = server_replay_config self.anneal_config=None if anneal_config is not None: self.anneal_config = anneal_config self.lr_scheduler = None if self.optimizer is None and self.server_replay_config is not None and "optimizer" in self.server_replay_config: self.optimizer = make_optimizer(self.server_replay_config["optimizer_config"], model) if self.optimizer is not None and self.anneal_config is not None: self.lr_scheduler = make_lr_scheduler( self.anneal_config, self.optimizer) self.cached_batches = [] self.ss_scheduler = ss_scheduler def reset_gradient_power(self): """Reset the sum of gradient power. This is used to compute statistics about the gradients. """ self.sum_grad = self.sum_grad2 = self.counter = 0 def accumulate_gradient_power(self): """Compute sum of gradient power. This is used to compute statistics about the gradients. """ for p in self.model.parameters(): if p.grad is None: continue grad = p.grad.detach().clone().cpu().numpy() p1 = np.sum(grad) p2 = np.sum(grad ** 2) n = p.grad.numel() self.sum_grad += p1 self.sum_grad2 += p2 self.counter += n print_rank("Magn. Grad. Squared: {}".format(self.sum_grad2), loglevel=logging.DEBUG) print_rank("Magn. 
Grad.: {}".format(self.sum_grad), loglevel=logging.DEBUG) return self.sum_grad, self.sum_grad2, self.counter def estimate_sufficient_stats(self): """Compute statistics about the gradients.""" sum_mean_grad, sum_mean_grad2, n = self.accumulate_gradient_power() mean_grad = sum_mean_grad / n mag_grad = np.sqrt(sum_mean_grad2 / n) var_grad = sum_mean_grad2 / n - mag_grad**2 norm_grad = np.sqrt(sum_mean_grad2) self.sufficient_stats = { "n": n, "sum": sum_mean_grad, "sq_sum": sum_mean_grad2, "var": var_grad, "mean": mean_grad, "mag": mag_grad, "norm": norm_grad } def train_desired_samples(self, desired_max_samples=None, apply_privacy_metrics=False, algo_payload = None): """Triggers training step. Args: desired_max_samples (int): number of samples that you would like to process. apply_privacy_metrics (bool): whether to save the batches used for the round for privacy metrics evaluation. Returns: 2-tuple of (float, int): total training loss and number of processed samples. """ num_samples = 0 total_train_loss = 0 algo_computation = None if algo_payload == None: num_samples_per_epoch, train_loss_per_epoch = self.run_train_epoch(desired_max_samples, apply_privacy_metrics) elif algo_payload['strategy'] == 'FedLabels': num_samples_per_epoch, train_loss_per_epoch, algo_computation = self.run_train_epoch_sup(desired_max_samples, apply_privacy_metrics, algo_payload) elif algo_payload['strategy'] == 'FedProx': num_samples_per_epoch, train_loss_per_epoch = self.run_train_epoch_fedprox(desired_max_samples, apply_privacy_metrics, algo_payload) num_samples += num_samples_per_epoch total_train_loss += train_loss_per_epoch return total_train_loss, num_samples, algo_computation def run_train_epoch(self, desired_max_samples=None, apply_privacy_metrics=False): """Implementation example for training the model. The training process should stop after the desired number of samples is processed. Args: desired_max_samples (int): number of samples that you would like to process. 
            apply_privacy_metrics (bool): whether to save the batches used for
                the round for privacy metrics evaluation.

        Returns:
            2-tuple of (int, float): number of processed samples and total training loss.
        """
        sum_train_loss = 0.0
        num_samples = 0
        self.reset_gradient_power()

        # Reset gradient just in case
        self.model.zero_grad()

        train_loader = self.train_dataloader.create_loader()
        for batch in train_loader:
            if desired_max_samples is not None and num_samples >= desired_max_samples:
                break

            # Compute loss
            if self.optimizer is not None:
                self.optimizer.zero_grad()

            if self.ignore_subtask is True:
                loss = self.model.single_task_loss(batch)
            else:
                if apply_privacy_metrics:
                    # NOTE(review): `indices` is only bound when the batch has
                    # an "x" or "input_ids" key; other batch layouts would
                    # raise NameError here -- confirm the expected schema.
                    if "x" in batch:
                        indices = to_device(batch["x"])
                    elif "input_ids" in batch:
                        indices = to_device(batch["input_ids"])
                    self.cached_batches.append(indices)
                loss = self.model.loss(batch)
            loss.backward()

            # Apply gradient clipping
            if self.max_grad_norm is not None:
                grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            # Sum up the gradient power
            self.estimate_sufficient_stats()

            # Now that the gradients have been scaled, we can apply them
            if self.optimizer is not None:
                self.optimizer.step()
            print_rank("step: {}, loss: {}".format(self.step, loss.item()), loglevel=logging.DEBUG)

            # Post-processing in this loop
            # Sum up the loss
            sum_train_loss += loss.item()

            # Increment the number of frames processed already
            if "attention_mask" in batch:
                num_samples += torch.sum(batch["attention_mask"].detach().cpu() == 1).item()
            elif "total_frames" in batch:
                num_samples += batch["total_frames"]
            else:
                num_samples += len(batch["x"])

            # Update the counters
            self.step += 1

        # Take a step in lr_scheduler
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return num_samples, sum_train_loss

    def run_train_epoch_fedprox(self, desired_max_samples=None, apply_privacy_metrics=False, algo_payload=None):
        """Implementation example for training the model.

        The training process should stop after the desired number of samples is processed.

        Args:
            desired_max_samples (int): number of samples that you would like to process.
            apply_privacy_metrics (bool): whether to save the batches used for
                the round for privacy metrics evaluation.
            algo_payload (dict): hyperparameters needed to fine-tune FedProx algorithm.

        Returns:
            2-tuple of (int, float): number of processed samples and total training loss.
        """
        sum_train_loss = 0.0
        num_samples = 0
        self.reset_gradient_power()

        # Reset gradient just in case
        self.model.zero_grad()

        # FedProx parameters: mu scales the proximal term; keep a frozen copy
        # of the global (round-start) model to regularize against
        mu = algo_payload['mu']
        global_model = to_device(copy.deepcopy(self.model))
        global_weight_collector = list(global_model.parameters())

        train_loader = self.train_dataloader.create_loader()
        for batch in train_loader:
            if desired_max_samples is not None and num_samples >= desired_max_samples:
                break

            # Compute loss
            if self.optimizer is not None:
                self.optimizer.zero_grad()

            if self.ignore_subtask is True:
                loss = self.model.single_task_loss(batch)
            else:
                if apply_privacy_metrics:
                    # NOTE(review): as in run_train_epoch, `indices` is only
                    # bound for "x"/"input_ids" batches -- confirm schema.
                    if "x" in batch:
                        indices = to_device(batch["x"])
                    elif "input_ids" in batch:
                        indices = to_device(batch["input_ids"])
                    self.cached_batches.append(indices)
                loss = self.model.loss(batch)

            # FedProx regularization term: (mu/2) * ||w - w_global||^2
            fed_prox_reg = 0.0
            for param_index, param in enumerate(self.model.parameters()):
                fed_prox_reg += ((mu / 2) * torch.norm((param - global_weight_collector[param_index]))**2)
            loss += fed_prox_reg

            loss.backward()

            # Apply gradient clipping
            if self.max_grad_norm is not None:
                grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            # Sum up the gradient power
            self.estimate_sufficient_stats()

            # Now that the gradients have been scaled, we can apply them
            if self.optimizer is not None:
                self.optimizer.step()
            print_rank("step: {}, loss: {}".format(self.step, loss.item()), loglevel=logging.DEBUG)

            # Post-processing in this loop
            # Sum up the loss
            sum_train_loss += loss.item()

            # Increment the number of frames processed already
            if "attention_mask" in batch:
                num_samples += torch.sum(batch["attention_mask"].detach().cpu() == 1).item()
            elif "total_frames" in batch:
                num_samples += batch["total_frames"]
            else:
                num_samples += len(batch["x"])

            # Update the counters
            self.step += 1

        # Take a step in lr_scheduler
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return num_samples, sum_train_loss

    def run_train_epoch_sup(self, desired_max_samples=None, apply_privacy_metrics=False, algo_payload=None):
        """Implementation example for training the model using semisupervision.

        Args:
            desired_max_samples (int): number of samples that you would like to process.
            apply_privacy_metrics (bool): whether to save the batches used for
                the round for privacy metrics evaluation.
            algo_payload (dict): datasets and configuration used during training
                for the FedLabels algorithm.

        Returns:
            3-tuple of (int, float, dict): number of processed samples, total
                training loss and unsupervised model state dict.
        """
        sum_train_loss = 0.0
        num_samples = 0
        round_ = algo_payload['iter']
        semisupervision_config = algo_payload['config']
        self.reset_gradient_power()

        # Reset gradient just in case
        self.model.zero_grad()

        # Losses / helpers for the semi-supervised objective
        KL_pointLoss = torch.nn.KLDivLoss(reduction="none", log_target=True)
        MSELoss = torch.nn.MSELoss()
        Softmax = torch.nn.LogSoftmax(dim=1)
        nolog_Softmax = torch.nn.Softmax(dim=1)
        initial_net = copy.deepcopy(self.model)
        loss_func = torch.nn.CrossEntropyLoss()

        # Create datasets
        normal_dataset, unsupdataset, unsupdataset_rand = algo_payload['data'][0], algo_payload['data'][1], algo_payload['data'][2]

        # NOTE(review): this overwrites self.optimizer with a hard-coded
        # SGD(lr=0.003, momentum=0) regardless of config -- confirm this is
        # intentional for FedLabels.
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.003, momentum=0)

        for i in range(int(semisupervision_config['train_ep'])):
            # One freshly-shuffled supervised batch per iteration
            sup_train = DataLoader(normal_dataset, batch_size=64, shuffle=True)
            data_sup = iter(sup_train)
            (images, labels) = next(data_sup)
            self.model.zero_grad()
            labels = to_device(labels)
            log_probs = self.model(to_device(images))
            loss = loss_func(log_probs, labels)
            num_samples += len(labels)
sum_train_loss += loss.item() loss.backward() self.optimizer.step() self.estimate_sufficient_stats() self.step += 1 # Update the counters print_rank("step: {}, loss: {}".format(self.step, loss.item()), loglevel=logging.DEBUG) net = copy.deepcopy(initial_net) optimizer = torch.optim.SGD(net.parameters(), lr=semisupervision_config['eta'], momentum=0) total_est_labels = 0 total_est_ratios = 0 correct = 0 if round_ >= semisupervision_config['burnout_round']: for _ in range(int(semisupervision_config['unsuptrain_ep'])): data_idx = random.sample(range(len(unsupdataset)), semisupervision_config['unl_bs']) partitioned = torch.utils.data.Subset(unsupdataset, indices=data_idx) ldr_train = DataLoader(partitioned, batch_size=semisupervision_config['bs'], shuffle=False) (images, true_labels) = next(iter(ldr_train)) images, true_labels = to_device(images), to_device(true_labels) initial_net.eval() self.model.eval() with torch.no_grad(): output_local = initial_net(images).detach() output_server = self.model(images).detach() local_logits = nolog_Softmax(output_local/semisupervision_config['temp']) server_logits = nolog_Softmax(output_server / semisupervision_config['temp']) est_labels, est_idx, est_var, est_ratio = get_label_VAT(local_logits, server_logits, semisupervision_config['thre'], semisupervision_config['comp']) total_est_labels += len(est_labels) total_est_ratios += est_ratio/semisupervision_config['unsuptrain_ep'] if len(est_labels) != 0: partitioned_rand = torch.utils.data.Subset(unsupdataset_rand, indices=data_idx) ldr_rand_train = DataLoader(partitioned_rand, batch_size=semisupervision_config['bs'], shuffle=False) (rand_images, _) = next(iter(ldr_rand_train)) rand_images = to_device(rand_images) correct += ((est_labels == true_labels[est_idx]).sum().item()) / ( len(est_idx) * semisupervision_config['unsuptrain_ep']) lamb_consist = semisupervision_config['vat_consis'] net.train() output = net(rand_images[est_idx]) if semisupervision_config['uda'] == 1 else 
net(images[est_idx]) output_norand = net(images[est_idx]) # Compute Losses, this should go inside model.py unsup_loss = loss_func(output, est_labels) kl_point_loss = KL_pointLoss(Softmax(output_norand / semisupervision_config['temp']), Softmax(output_server[est_idx]/semisupervision_config['temp'])) consist_loss = torch.tensor(0.0, requires_grad=True) consist_tmp = torch.tensor(0.0) for i in range(len(est_var)): if torch.argmax(local_logits[est_idx[i]]) == torch.argmax(server_logits[est_idx[i]]): dummy = kl_point_loss[i]*est_var[i] consist_tmp += 1 consist_loss = consist_loss+ dummy.sum() if consist_tmp != torch.tensor(0.0): consist_loss = consist_loss/consist_tmp l2_lambda = semisupervision_config['l2_lambda'] initial_net.eval() reg_loss = torch.tensor(0., requires_grad=True) for p, prev_param in zip(net.parameters(), initial_net.parameters()): reg_loss = reg_loss + MSELoss(p, prev_param) (semisupervision_config['unsup_lamb']*unsup_loss + lamb_consist*consist_loss+l2_lambda*reg_loss).backward(retain_graph=True) optimizer.step() return total_est_labels, sum_train_loss/semisupervision_config['ensize'], net.state_dict() def get_model(self): return copy.deepcopy(self.model) def prepare_iteration(self, model=None): """Steps to run before iteration begins.""" if model is not None: self.model.load_state_dict(model.state_dict()) self.lr_scheduler = None if self.optimizer is None and self.server_replay_config is not None and \ "optimizer_config" in self.server_replay_config: print_rank("Creating server-side replay training optimizer", loglevel=logging.DEBUG) self.optimizer = make_optimizer(self.server_replay_config["optimizer_config"], self.model) if self.optimizer is not None and self.anneal_config is not None: print_rank("Creating server-side replay-training lr_scheduler", loglevel=logging.DEBUG) self.lr_scheduler = make_lr_scheduler(self.anneal_config, self.optimizer) def reset_optimizer(self, optimizer_state_dict, annealing_config=None): """Re-load optimizer.""" assert 
self.optimizer is not None, "This trainer does not have an optimizer" # Load optimizer on state dict self.optimizer.load_state_dict(optimizer_state_dict) # Set learning rate scheduler self.lr_scheduler = None if annealing_config is not None: self.lr_scheduler = make_lr_scheduler(annealing_config, self.optimizer) def save(self, model_path, token=None, config=None): """Save model to disk.""" save_model( model_path=model_path, config=config, model=self.model, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler, ss_scheduler=self.ss_scheduler, token=token ) def load(self, save_path, update_lr_scheduler, update_ss_scheduler): """Load model from disk. If save_path is given, load from there. If not, then resume training from current model dir. If at any point the save_path is not present on the disk, it won't be loaded. """ if os.path.isfile(save_path): print_rank("Loading checkpoint: {}".format(save_path)) checkpoint = torch.load(save_path) self.model.load_state_dict(checkpoint["model_state_dict"]) if self.optimizer is not None: self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) anl_st_dict = checkpoint.get("lr_scheduler_state_dict") if anl_st_dict and self.lr_scheduler is not None and update_lr_scheduler is True: self.lr_scheduler.load_state_dict(anl_st_dict) sss_st_dict = checkpoint.get("ss_scheduler_state_dict") if sss_st_dict and self.ss_scheduler is not None and update_lr_scheduler is True: self.ss_scheduler.load_state_dict(sss_st_dict) def run_validation_generic(model, val_dataloader): """Perform a validation step. Args: model (torch.nn.Module): model to be validated. val_dataloader (torch.data.utils.DataLoader): provides val data. Returns: Average validation loss. """ print_rank("run_validation_generic", loglevel=logging.DEBUG) model.set_eval() print_rank("set_eval", loglevel=logging.DEBUG) # Initialize dataloader etc. 
val_loader = val_dataloader.create_loader() print_rank( f"created loader {val_loader.num_workers}, " + \ f"users: {len(val_dataloader.dataset.user_list)} " + \ f"examples: {sum(val_dataloader.dataset.num_samples)} " + \ f"lendata: {len(val_loader)} ", loglevel=logging.DEBUG ) print_rank( f"drop_last: {val_loader.drop_last} " + \ f"len_sampler: {len(val_loader._index_sampler)}", loglevel=logging.DEBUG ) print_rank("Loading metrics ...", logging.DEBUG) metrics_cl = Metrics() return metrics_cl.compute_metrics(dataloader=val_loader, model=model) def set_component_wise_lr(model, optimizer_config, updatable_names): """Set zero learning rate for layers in order to freeze the update. Args: model (torch.nn.Module): optimizer_config (string): updatable_names (list): ["^dec_rnn", "^fc"] """ def name_matched(name, updatable_names): for updatable_name in updatable_names: if re.match(updatable_name, name) is not None: return True return False # Set learning rate to zero in layers which name does not follow regex parameters = [] for name, params in model.named_parameters(): if name_matched(name, updatable_names) is True: print_rank("updating {} with lr = {}".format(name, optimizer_config["lr"])) parameters.append({"params": params, "lr":optimizer_config["lr"]}) else: print_rank("freezing {}".format(name)) parameters.append({"params": params, "lr": 0.0}) return parameters def save_model(model_path, config, model, optimizer, lr_scheduler, ss_scheduler, token=None): """Save a model as well as training information.""" save_state = { "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict() if optimizer is not None else None, "lr_scheduler_state_dict": lr_scheduler.state_dict() if lr_scheduler is not None else None } if ss_scheduler is not None: save_state["ss_scheduler_state_dict"] = ss_scheduler.state_dict() if token: # just save as "best" and return save_path = os.path.join(model_path, "{}_model.tar".format(token)) else: save_path = 
os.path.join(model_path, "model.tar") print_rank("Saving model to: {}".format(save_path)) try_except_save(torch_save, state_or_model=save_state, save_path=save_path) # Write out the config to model_dir if config is not None: try_except_save(write_yaml, config=config, save_path=os.path.join(model_path, "config.yaml")) ================================================ FILE: doc/sphinx/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: doc/sphinx/advanced.rst ================================================ Advanced Topics =============== Privacy ------- Aggregation Options ------------------- Optimizer Options ----------------- ================================================ FILE: doc/sphinx/class_reference.rst ================================================ Class Reference =============== FLUTE Core ~~~~~~~~~~ core/server ----------- .. automodule:: core.server :members: :special-members: __init__ core/client ----------- .. automodule:: core.client :members: :special-members: __init__ core/federated -------------- .. automodule:: core.federated :members: :special-members: __init__ core/config ----------- .. 
automodule:: core.config :members: :special-members: __init__ ================================================ FILE: doc/sphinx/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- project = 'FLUTE' copyright = '2021, Microsoft Research' author = 'Microsoft Research' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # #html_theme = 'alabaster' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] import sphinx_rtd_theme html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] ================================================ FILE: doc/sphinx/index.rst ================================================ .. FLUTE documentation master file, created by sphinx-quickstart on Sat Jun 19 09:15:36 2021. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to FLUTE documentation! =============================== .. toctree:: :maxdepth: 2 :caption: Contents: overview scenarios launch advanced reference class_reference Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: doc/sphinx/launch.rst ================================================ Launch FLUTE ================ Local run ------------ Install the requirements stated inside of requirements.txt. Ideally this sould be done inside of a virtual environment, for instance, using Anaconda. .. code:: bash conda create -n FLUTE python==3.8 pip install -r requirements.txt FLUTE uses torch.distributed API as its main communication backbone, supporting three buil-in backends. For more information please refer to [Distributed Communication Package](https://pytorch.org/docs/stable/distributed.html). Therefore, we highly suggest to use NCCL backend for distributed GPU training and Gloo for distributed CPU training. There is no `setup.py` as FLUTE is not currently distributed as a package, but instead meant to run from the root of the repository. After this initial setup you can use your data for launching a local run. However the following instructions will be adapted to run ``nlg_gru`` task. For running this example, you need to first download and preprocess the data. 
Instructions can be found `here`_. Once the data is available you can run FLUTE from root as follows: .. code:: bash python -m torch.distributed.run --nproc_per_node=3 e2e_trainer.py -dataPath ./testing/mockup -outputPath scratch -config testing/configs/hello_world_local.yaml -task nlg_gru -backend nccl .. _here: https://github.com/microsoft/msrflute/tree/main/testing If the setup of the experiment has been done correctly, after the model initialization we would be able to see the clients being trained: .. figure:: img/run.png :align: center :width: 800 Local run for nlg_gru task. AML Run ------------ FLUTE has a native integration for job submissions with Azure ML, allowing users to use the built-in CLI or web interface for job/experiment tracking. For running experiments on AzureML, the CLI can help. You should first install the CLI `install the CLI`_ (make sure you have v2) and `create a resource group and workspace`_. You can then create a compute cluster, type ``az ml compute create -h`` for more info. Afterwards, you should write a YAML file with instructions for the job; we provide a simple example below: .. _install the CLI: https://docs.microsoft.com/en-us/azure/machine-learning/reference-azure-machine-learning-cli .. _create a resource group and workspace: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli?tabs=vnetpleconfigurationsv1cli%2Ccreatenewresources%2Cworkspaceupdatev1%2Cworkspacesynckeysv1%2Cworkspacedeletev1 .. code:: yaml experiment_name: basic_example description: Basic example of AML config for submitting FLUTE jobs code: local_path: . 
compute: azureml:Test environment: image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel inputs: data: folder: azureml://datastores/data/paths/cifar mode: rw_mount command: > apt -y update && apt -y install openmpi-bin libopenmpi-dev openssh-client && python3 -m pip install --upgrade pip && python3 -m pip install -r requirements.txt && python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -outputPath=./outputs -dataPath={inputs.data} -task=classif_cnn -config=./experiments/classif_cnn/config.yaml -backend=nccl You should replace ``compute`` with the name of the one you created before, and adjust the path of the datastore containing the data. In the example above, we created a datastore called ``data`` and added to it a folder called ``cifar``, which contained the two HDF5 files. The command passed above will install dependencies and then launch a NCCL job with 4 threads, for the experiment defined in ``experiments/classif_cnn``. Details on how to run a job using the AzureML CLI are given in its `documentation`_ , but typically it suffices to set up the environment and type ``az ml job create -f ``. In the same page of the documentation, you can also find more info about how to set up the YAML file above, in case other changes are needed. .. _documentation: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli .. note:: The local_path above is relative to the location of the YAML file. Setting it to ``.`` assumes it is in the same folder as ``e2e_trainer.py``. .. note:: All files on this folder will be uploaded to Azure, including hidden folders such as ``.git``, make sure to remove large files and folders that are not needed. After launching the experiment, you can follow it on AzureML Studio, which prints logs, plots metrics and makes the output easily available after the experiment is finished. 
================================================
FILE: doc/sphinx/make.bat
================================================
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd


================================================
FILE: doc/sphinx/overview.rst
================================================
FLUTE Overview
==============

FLUTE: Federated Learning Utilities and Tools for Experimentation is a high-performance open source platform that enables researchers and developers to perform rapid prototyping and offline simulations of novel federated learning algorithms at scale.

A FLUTE job consists of one or more nodes (physical or virtual machines) executing a total of K workers that can become a Server or Client.

.. figure:: img/client-server.png
   :align: center
   :width: 600

FLUTE uses a distributed processing architecture backed by torch.distributed. Worker 0 acts as a central orchestrator, maintaining and distributing the central model to workers, and subsequently distributing client tasks to them.

On each training round the orchestrator takes care of:

* Dispatch the central model to the rest of the workers
* Queues up client tasks for workers to execute.
Workers receive client tasks (client training data and training config) and:

* Execute SGD on the central model using their client's training data
* Send model delta (pseudo-gradient) back to the orchestrator.

Each worker>0 processes client tasks sequentially, consisting of data encoding and one or more batch updates to the central model (note the central model is reset to its original state for each client task). As each client task completes, the model delta, aka the pseudo-gradient, is sent back to the orchestrator for federation into a new central model.

Execution runs for up to N training rounds. In each round the orchestrator may sample a subset of clients, and may also randomly delay pseudo-gradient updates from some clients to future rounds. The orchestrator will also periodically distribute evaluation tasks to determine model quality on validation and test data.

.. note:: AzureML generally expects there will be one worker per GPU on each node.

Architecture
------------

FLUTE design is based on a central server architecture.

.. figure:: img/architecture.png
   :align: center
   :width: 500

   FLUTE logical workflow.

The logical workflow performed is:

1. Send an initial global model to clients.
2. Train instances of the global model with locally available data on each client.
3. Send training information to the Server (e.g. adapted models, logits, pseudo-gradients).
4. Combine the returned information on the server to produce a new model.
5. Optionally, update the global model with an additional server-side rehearsal step.
6. Send the updated global model back to the clients.
7. Repeat steps 2-6 after sampling a new subset of clients for the next training iteration.
================================================ FILE: doc/sphinx/reference.rst ================================================ Option Reference ================ Command Line Arguments ---------------------- YAML Configuration ------------------ FLUTE yaml files consist of three main sections, and a few optional sections. The `model_config` specifies model architecture and pretrained model setup path. The `server_config` section defines server settings such as total training rounds, aggregation method, optimizer settings, learning rate schedule, and any server-side training data. The `client_config` section specifies client optimizer settings and the client-side training data. .. note:: Training data is loaded by the server and dispatched to the clients. The configuration settings for this data are specified in the `client_config`. model_config ~~~~~~~~~~~~ server_config ~~~~~~~~~~~~~ client_config ~~~~~~~~~~~~~ Optional Sections ----------------- In addition to the main sections, some optional sections may be specified to control privacy settings, specifically a `dp_config` section for differential privacy settings, and `privacy_metrics_config` for applying privacy metrics. dp_config ~~~~~~~~~ privacy_metrics_config ~~~~~~~~~~~~~~~~~~~~~~ ================================================ FILE: doc/sphinx/requirements.txt ================================================ sphinx_rtd_theme jinja2==3.0.3 ================================================ FILE: doc/sphinx/scenarios.rst ================================================ Adding New Scenarios ==================== Data Preparation ------------ FLUTE provides the abstract class `BaseDataset` inside ``core/dataset.py`` that can be used to wrap any dataset and make it compatible with the platform. The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `num_samples` and `user_data_labels` (optional). These attributes are required to have these exact names. 
The abstract method ``load_data ()`` should be used to instantiate/load the dataset and provide the training format required by FLUTE on-the-fly. Here is a sample data blob for language model training. .. code:: json { "users": ["bert","elmo"], "user_data": { "bert": {"x": ["my name is Bert.", "I live with Ernie."]}, "elmo": {"x": ["Big Bird is my friend."]} }, "num_samples": [2, 1] } The blob consists of three fields: * ``users``: indicates a unique id for each user in the training data. Users are sampled uniformly to create client tasks during training. There could be many more users than client tasks per round or even over all client tasks over all rounds. * ``num_samples`` : indicates the number of samples for each user, in the same order as ``users`` list. That is, for any index ``i`` in ``range(len(data['users']))``: * ``user_data``: contains user-indexed training data. Each user's data is a dictionary of the form ``{"x": [list of examples]}``. If labels are needed by the task, ``user_data_label`` will be required by FLUTE with the user-indexed labels. The format should be similar to ``user_data`` where each user's label is a dictionary of the form ``{"x": [list of labels]}`` as follows: .. code:: json "user_data_label": { "bert": {"x": [ 0 , 1 ]}, "elmo": {"x": [ 0 ]} } .. note:: Test and validation data is formatted similarly. .. note:: Test/validate data is dispatched to workers by partitioning on users. If your test data isn't user-partitioned, we recommend partitioning it uniformly using some dummy user ids. Add the model to FLUTE -------------- FLUTE requires the model declaration framed in PyTorch, which must inhereit from the `BaseModel` class defined in ``core/model.py``. The following methods should be overridden: * __init__: model definition * loss: computes the loss used for training rounds * inference: computes the metrics used during evaluation rounds Please see the example provided below: .. 
code:: python from core.model import BaseModel class CNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = Net() def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) return F.cross_entropy(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() f1 = f1_score(labels.cpu(), torch.argmax(output, dim=1).cpu(), average='micro') # NOTE: Only the keys 'output','acc' and 'batch_size' does not require # extra fields as 'value' and 'higher is better'. FLUTE requires this # format only for customized metrics. return {'output':output, 'acc': accuracy, 'batch_size': n_samples, \ 'f1_score': {'value':f1,'higher_is_better': True}} Once the model is ready, all mandatory files must be in a single folder inside ´{/experiments´. Please adjust your files with the following naming structure so FLUTE can be able to find all the scripts needed. .. code-block:: bash task_name |---- dataloaders |---- dataloader.py |---- dataset.py |---- utils |---- utils.py (if needed) |---- model.py |---- config.yaml |---- README.txt .. note:: In case you need to import a module that has not been considered in FLUTE, this can be added in requirements.txt .. note:: All files must contain only absolute imports, in order to avoid issues when running. Implement new metrics -------------- The metrics computed during the evaluation rounds are declared inside `inference()` in the model declaration. 
FLUTE requires this function to return a dictionary with at least `output`, `acc` and `batch_size` as follows: .. code:: bash { "output": loss, "acc": accuracy, "batch_size": batch_size} In order to add a new metric, we just need to add the key inside the same dictionary with the following format: .. code:: bash { "output": loss, "acc": accuracy, "batch_size": batch_size, "custom_metric_1": {"value": value1 ,'higher_is_better': True}, "custom_metric_2": {"value": value2 ,'higher_is_better': False}} Once the keys have been included in the returning dictionary from `inference()`, FLUTE will automatically recognize them during the test/val rounds. .. note:: Only the keys `output`, `acc` and `batch_size` does not require a dictionary. Create the configuration file --------------------------------- The configuration file will allow you to specify the setup in your experiment, such as the optimizer, learning rate, number of clients and so on. FLUTE requires the following 6 sections: * model_config: path an parameters (if needed) to initialize the model. * dp_config: differential privacy setup. * privacy_metrics_config: for cache data to compute additional metrics. * strategy: defines the federated optimizer. * server_config: determines all the server-side settings. * client_config: dictates the learning parameters for client-side model updates. The blob below indicates the basic parameters required by FLUTE to run an experiment: .. 
code:: yaml model_config: model_type: CNN # Class name in model.py model_folder: experiments/classif_cnn/model.py # Relative path to the model declaration dp_config: enable_local_dp: false # DP disabled privacy_metrics_config: apply_metrics: false # Privacy metrics disabled strategy: DGA # Federated optimizar (DGA or FedAvg) server_config: wantRL: false # Whether to use RL-based meta-optimizers resume_from_checkpoint: false # Restart from checkpoint if file exists do_profiling: false # Run profiler and compute runtime metrics optimizer_config: # Optimizer used to update the global model type: sgd lr: 1.0 annealing_config: # Annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # Validation rounds frequency rec_freq: 100 # Testing rounds frequency initial_val: true # Enable initial validation round initial_rec: true # Enable initial testing round max_iteration: 2000 # Total of iteration rounds num_clients_per_iteration: 10 # Clients per interation data_config: # Information for the test/val dataloaders val: batch_size: 10000 val_data: test_data.hdf5 # Assign to null for data loaded on-the-fly test: batch_size: 10000 test_data: test_data.hdf5 # Assign to null for data loaded on-the-fly type: model_optimization # Server type (model_optimization is the only available for now) aggregate_median: softmax # How aggregations weights are computed initial_lr_client: 0.001 # Learning rate used on optimizer lr_decay_factor: 1.0 # Decay factor for LR weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. 
grad_mean_loss, train_loss) best_model_criterion: f1_score # Determines the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model softmax_beta: 1.0 # Beta value to use for the softmax DGA client_config: do_profiling: false # Run profiling and compute runtime metrics ignore_subtask: false # Determines which model loss to use. In most cases just set to False. data_config: # Information for the train dataloader train: batch_size: 4 list_of_train_data: train_data.hdf5 # Assign to null for data loaded on-the-fly desired_max_samples: 50000 optimizer_config: # Optimizer used by the client type: sgd lr: 0.001 # This is overridden by `initial_lr_client` momentum: 0.9 type: optimization # The type of client (always set "optimization for now") .. note:: Documented templates for all the options available in the configuration files are provided inside configs folder. ================================================ FILE: e2e_trainer.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' This is the main script to run on each NCCL/GLOO thread. It will spawn either a Server or Worker object -- the former is responsible for orchestrating and aggregating models, where as the latter processes clients' data to generate a new model. The Server lives on the very first thread, whereas remaining threads contain each a diferent Worker. 
''' import argparse import os import shutil import yaml import logging from psutil import virtual_memory import torch import torch.distributed as dist from azureml.core import Run from core import federated from core.config import FLUTEConfig from core.server import select_server from core.client import Client from experiments import make_model from utils import ( make_optimizer, init_logging, print_rank, find_pretrained_model ) from utils.dataloaders_utils import ( make_train_dataloader, get_dataset, ) from core.evaluation import make_eval_clients def log_run_properties(config: FLUTEConfig): """Log parameters on AzureML. Args: config (dict): config containing parameters to log. """ properties = {} # Build properties dictionary mem = virtual_memory() properties["System memory (GB)"] = float(mem.total) / (1024**3) props = [ ("server_config.num_clients_per_iteration", 0), ("server_config.max_iteration", 0), ("dp_config.eps", 0), ("dp_config.max_weight", 0), ("dp_config.min_weight", 0), ("server_config.optimizer_config.type", "sgd"), ("server_config.optimizer_config.lr", 1.0), ("server_config.optimizer_config.amsgrad", False), ("server_config.annealing_config.type", "step_lr"), ("server_config.annealing_config.step_interval", "epoch"), ("server_config.annealing_config.gamma", 1.0), ("server_config.annealing_config.step_size", 100), ] for (key, default) in props: properties[key] = config.lookup(key, default) # Log the properties dictionary into AzureML run = Run.get_context() for k in properties: run.log(k, properties[k]) def run_worker(model_path, config, task, data_path, local_rank, backend): """Spawn worker object that lives throughout NCCL/GLOO thread. Args: model_path (str): path to the pretrained model. config (dict): dictionary containing parameters. task (str): what task to solve, must be a folder of :code:`experiments`. data_path (str): path to data. local_rank (int): the rank of the NCCL/GLOO thread. 
""" model_config = config["model_config"] server_config = config["server_config"] client_config = config["client_config"] # Backend initialization WORLD_RANK = federated.rank() LOCAL_RANK = federated.local_rank() print_rank(f"Backend: {backend}") dist.init_process_group(backend=backend, init_method=None, rank=WORLD_RANK, world_size=federated.size()) # Assign NCCL thread to a specific GPU if torch.cuda.is_available(): print_rank(f"Assigning worker to GPU {LOCAL_RANK}") device = torch.device("cuda:{}".format(LOCAL_RANK)) torch.cuda.set_device(device) # Make the Model to distribute to workers model = make_model(model_config) # Get evaluation datasets val_dataset = get_dataset(data_path, config, task, mode="val", test_only=True) test_dataset = get_dataset(data_path, config, task, mode="test", test_only=True) # Create list of clients for test/val -- Server need the indexes and Worker the clients list val_clients = list(make_eval_clients(val_dataset, config)) test_clients = list(make_eval_clients(test_dataset, config)) # pre-cache the training data and capture the number of clients for sampling num_clients = Client.get_train_dataset(data_path, config, task) config["server_config"]["data_config"]["num_clients"] = num_clients # Instantiate the Server object on the first thread if WORLD_RANK == 0: single_worker = None if federated.size() == 1: # For a single-GPU/CPU execution using NCCL, Server and Worker are instantiated in the same GPU. 
single_worker = federated.Worker(model=model, data_path=data_path, do_profiling=client_config.get("do_profiling", False), val_clients=val_clients, test_clients=test_clients, val_dataset = val_dataset, test_dataset = test_dataset, config= config) single_worker.run() try: print_rank('Server data preparation') if 'train' in config['server_config']['data_config']: server_train_dataloader = make_train_dataloader(config['server_config']['data_config']['train'], data_path, task=task, clientx=None) else: server_train_dataloader = None idx_val_clients = list(range(len(val_clients))) # Generates indexes for val clients idx_test_clients = list(range(len(test_clients))) # Generates indexes for test clients print_rank("Prepared the dataloaders") # Create the optimizer on the server optimizer = make_optimizer(server_config["optimizer_config"], model) # Load a model that's already trained best_trained_model = find_pretrained_model(model_path, model_config) if best_trained_model is not None: model_state_dict = torch.load(best_trained_model, map_location=None if torch.cuda.is_available() else torch.device("cpu")) model.load_state_dict(model_state_dict) server_type = server_config["type"] server_setup = select_server(server_type) # Return the server class server = server_setup( num_clients=config['server_config']['data_config']["num_clients"], model=model, optimizer=optimizer, ss_scheduler=None, data_path=data_path, model_path=model_path, server_train_dataloader=server_train_dataloader, config=config, idx_val_clients=idx_val_clients, idx_test_clients=idx_test_clients, single_worker=single_worker, ) log_run_properties(config) except Exception as e: # Be sure the other workers are shut down. 
server.terminate_workers() raise e print_rank("Launching server") server.run() else: # Instantiate client-processing Worker on remaining threads print_rank("Worker on node {}: process started".format(WORLD_RANK)) worker = federated.Worker( model=model, data_path=data_path, do_profiling=client_config.get("do_profiling", False), val_clients=val_clients, test_clients=test_clients, val_dataset = val_dataset, test_dataset = test_dataset, config= config, ) worker.run() if __name__ == "__main__": # Parse command-line arguments parser = argparse.ArgumentParser() parser.add_argument("-config") parser.add_argument("-outputPath") parser.add_argument("-dataPath", default=None) parser.add_argument("-task", default=None, help="Define the task for the run") parser.add_argument("-backend", default=None, help="Define the communication protocol") parser.add_argument("-num_skip_decoding", default=-1, type=int, help="Skip decoding in unsupervised learning mode") parser.add_argument("--local_rank", default=-1, type=int) args = parser.parse_args() data_path = args.dataPath task = args.task local_rank = args.local_rank assert args.backend in ['nccl','gloo'], f"Backend {args.backend} not recognized, please select nccl or gloo" backend = args.backend # The mount point can also be retrieved from input_datasets of the run context if data_path is None: data_path = Run.get_context().input_datasets["input"] print("The data can be found here: ", data_path) # Update the model path for the sake of AzureML id = Run.get_context().id experiment_name = "-".join(id.split("-")[-4:-2]) experiment_root = os.path.join(args.outputPath, experiment_name) os.makedirs(experiment_root, exist_ok=True) model_path = os.path.join(experiment_root, "models") log_path = os.path.join(experiment_root, "log") os.makedirs(model_path, exist_ok=True) os.makedirs(log_path, exist_ok=True) # Make a copy of the config file into the output folder, for future reference cfg_out = os.path.join(experiment_root, "FLUTE_config.yaml") 
if local_rank <= 0: shutil.copyfile(args.config, cfg_out) # Initialize logging init_logging(log_path, loglevel=logging.INFO) with open(args.config) as f: cfg_dict = yaml.safe_load(f) config = FLUTEConfig.from_dict(cfg_dict) config["data_path"] = data_path config["output_path"] = args.outputPath config["model_path"]= model_path config["experiment_name"] = experiment_name config["client_config"]["task"] = task config["server_config"]["task"] = task config.validate() # Instantiate either Server or Worker on the thread run_worker(model_path, config, task, data_path, local_rank, backend) ================================================ FILE: experiments/__init__.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch from utils import print_rank, print_cuda_stats, to_device from importlib.machinery import SourceFileLoader def make_model(model_config, dataloader_type=None, input_dim=-1, output_dim=-1): print('Preparing model .. 
Initializing') try: dir = "./"+ str(model_config["model_folder"]) model_class = model_config["model_type"] loader = SourceFileLoader(model_class,dir).load_module() model_type = getattr(loader,model_class ) except: raise ValueError("{} model not found, make sure to indicate the model path in the .yaml file".format(model_config["type"])) model = model_type(model_config) print(model) if not "weight_init" in model_config or model_config["weight_init"] == "default": print_rank("initialize model with default settings") pass elif model_config["weight_init"] == "xavier_normal": print_rank("initialize model with xavier_normal") for p in model.parameters(): if p.dim() > 1: # weight torch.nn.init.xavier_normal_(p.data) elif p.dim() == 1: # bias p.data.zero_() for m in model.modules(): if isinstance(m, (torch.nn.Embedding, torch.nn.LayerNorm, torch.nn.BatchNorm2d)): m.reset_parameters() else: return ValueError("{} not supported".format(model_config["weight_init"])) print_rank("trying to move the model to GPU") model = to_device(model) print_rank("model: {}".format(model)) print_cuda_stats() return model ================================================ FILE: experiments/classif_cnn/.gitignore ================================================ utils/data *.hdf5 *.json ================================================ FILE: experiments/classif_cnn/README.md ================================================ # Simple example of a CNN on CIFAR-10 Our objective here is to bring a simple experiment from the Pytorch tutorials, more specifically the one in https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py, and convert it to FLUTE. Instructions on how to do this are given below. An adapted version of the tutorial above is provided in the `utils/centralized_training.py` script. 
## Preparing the data In this experiment we are making use of the CIFAR10 Dataset from torchvision, initialized in `dataloaders/cifar_dataset.py`, which inherits from the FLUTE base dataset class `core/dataset.py` ## Specifying the model Next, we prepare the model. The `model.py` file contains two classes: one is the `Net` class already contained in the original script, and the other, a class called `CNN` which effectively wraps `Net`. Importantly, the `CNN` class defines two methods: `loss` and `inference`; both perform forward steps and then perform additional computations, in particular, the former executes the loss' evaluation, and the latter the metrics' computation. The format of the inputs and outputs should be the same as in this example. ## Specifying dataset and dataloaders Inside the `dataloaders` folder, there are two files: `dataset.py` and `dataloader.py`. Both inherit from the base classes declared in the `core` folder, which under the hood inherit from Pytorch classes with the same name. The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `user_data_labels` and `num_samples` (user names, user features, user labels if the problem is supervised, and number of samples for each user, respectively). These attributes are required to have these exact names. Otherwise, it should also be able to access the examples of a specific user, whose id is passed during initialization via the `user_idx` argument. The dataloader is simpler, and essentially just instantiates the dataset and creates batches with a specific format. ## Creating a config file All the parameters of the experiment are passed in a YAML file. A documented example is provided in `config.yaml`. ## Running the experiment Finally, to launch the experiment, it suffices to launch the `e2e_trainer.py` script using torch.distributed.
``` python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath experiments/classif_cnn/utils/data -outputPath scratch -config experiments/classif_cnn/config.yaml -task classif_cnn -backend gloo ``` The `dataPath`, `outputPath` and `config` arguments should just specify the respective files or folders, as in the example above -- in this case, a folder called `scratch` will be created containing logs and checkpoints. The task should be the name of the folder insider `experiments`. Following what is specified in the config file, the experiment will run for 2000 rounds, and during each of them 10 clients will be selected at random, each of whom has 50 samples. It is more or less the same, then, as the 2 epochs in the centralized training, except that clients are selected at random so we might not see all of them. ================================================ FILE: experiments/classif_cnn/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. # Parameters needed to initialize the model model_config: model_type: CNN # class w/ `loss` and `inference` methods model_folder: experiments/classif_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # how many iterations between metric eval on val set rec_freq: 100 # how many iterations between metric eval on test set initial_val: true initial_rec: true max_iteration: 2000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 10000 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: f1_score fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 4 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.9 type: optimization ================================================ FILE: experiments/classif_cnn/dataloaders/cifar_dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import time import torchvision import torchvision.transforms as transforms class CIFAR10: def __init__(self) : # Get training and testing data from torchvision transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) print('Processing training set...') self.trainset=_process(trainset, n_users=1000) print('Processing test set...') self.testset=_process(testset, n_users=200) def _process(dataset, n_users): '''Process a Torchvision dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) # Convert training data to expected format print('Converting data to expected format...') start_time = time.time() data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : 10000 * [samples_per_user], 'user_data' 
: {f'{user_id:04d}': dataset.data[user_idxs(user_id)].tolist() for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } print(f'Finished converting data in {time.time() - start_time:.2f}s.') return data_dict ================================================ FILE: experiments/classif_cnn/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch from core.dataloader import BaseDataLoader from experiments.classif_cnn.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/classif_cnn/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.classif_cnn.dataloaders.cifar_dataset import CIFAR10 class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = CIFAR10() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/classif_cnn/model.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import torch from torch import nn from torch.nn import functional as F from sklearn.metrics import f1_score from core.model import BaseModel class Net(nn.Module): '''The standard PyTorch model we want to federate''' def __init__(self): super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = torch.flatten(x, 1) # flatten all dimensions except batch x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x class CNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = Net() def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) return F.cross_entropy(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() f1 = f1_score(labels.cpu(), torch.argmax(output, dim=1).cpu(), average='micro') # NOTE: Only the keys 'output','acc' and 'batch_size' does not require # extra fields as 'value' and 'higher is better'. FLUTE requires this # format only for customized metrics. 
return {'output':output, 'acc': accuracy, 'batch_size': n_samples, \ 'f1_score': {'value':f1,'higher_is_better': True}} ================================================ FILE: experiments/classif_cnn/utils/centralized_training.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. '''Simple example of a CNN on CIFAR-10 This is adapted from the Pytorch tutorials. See https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py for more info. ''' import torch import torchvision import torchvision.transforms as transforms import torch.nn as nn import torch.nn.functional as F import torch.optim as optim # Parameters BATCH_SIZE = 4 N_EPOCHS = 2 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Create dataloaders transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2) # Define the model class Net(nn.Module): def __init__(self): super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = torch.flatten(x, 1) # flatten all dimensions except batch x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x # Instantiate model, loss and optimizer net = Net().to(device) criterion = nn.CrossEntropyLoss() optimizer = 
optim.SGD(net.parameters(), lr=0.001, momentum=0.9) # Training loop for epoch in range(N_EPOCHS): # loop over the dataset multiple times running_loss = 0.0 for i, data in enumerate(trainloader, 0): # Get the inputs; data is a list of [inputs, labels] inputs, labels = data[0].to(device), data[1].to(device) # Zero the parameter gradients optimizer.zero_grad() # Forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # Print statistics running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 # Compute accuracy correct = 0 total = 0 with torch.no_grad(): for data in testloader: images, labels = data[0].to(device), data[1].to(device) outputs = net(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() print('Accuracy of the network on the 10000 test images: %d %%' % ( 100 * correct / total)) ================================================ FILE: experiments/classif_cnn/utils/download_and_convert_data.py ================================================ import h5py import json import time import torchvision import torchvision.transforms as transforms import tqdm def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in tqdm.tqdm(data_dict['user_data'].items()): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in tqdm.tqdm(data_dict['user_data_label'].items()): user_data_label_group.create_dataset(user, 
data=user_data_label) def _process_and_save_to_disk(dataset, n_users, file_format, output): '''Process a Torchvision dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) # Convert training data to expected format print('Converting data to expected format...') start_time = time.time() data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : 10000 * [samples_per_user], 'user_data' : {f'{user_id:04d}': dataset.data[user_idxs(user_id)].tolist() for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } print(f'Finished converting data in {time.time() - start_time:.2f}s.') # Save training data to disk print('Saving data to disk...') start_time = time.time() if file_format == 'json': with open(output + '.json', 'w') as json_file: json.dump(data_dict, json_file) elif file_format == 'hdf5': with h5py.File(output + '.hdf5', 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) else: raise ValueError('unknown format.') print(f'Finished saving data in {time.time() - start_time:.2f}s.') # Get training and testing data from torchvision transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) print('Processing training set...') _process_and_save_to_disk(trainset, n_users=1000, file_format='hdf5', output='./data/train_data') print('Processing test 
set...') _process_and_save_to_disk(testset, n_users=200, file_format='hdf5', output='./data/test_data') ================================================ FILE: experiments/cv/README.md ================================================ # Simple example of ResNet model using personalization Our objective here is to bring a simple experiment of Computer Vision task, and convert it to FLUTE using the personalization feature. Instructions on how to do this are given below. ## Preparing the data In this experiment we are making use of the CIFAR10 Dataset from torchvision, initializated in `data.py`, which is wrapped by FLUTE Base Dataset. ## Specifying the model Next, we prepare the model. The `model.py` file contains different classes than can be used for this experiment. However, for this example we are using the `ResNet` class . Importantly, the `ResNet` class inheeits from `Base Model` declared in `core/model.py` and defines two methods: `loss` and `inference`; both perform forward steps and then perform additional computations, in particular, the former executes the loss' evaluation, and the latter the metrics' computation. The format of the inputs and outputs should be the same as in this example. ## Specifying dataset and dataloaders Inside the `dataloaders` folder, there are two files: `dataset.py` and `dataloader.py`. Both inherit from the base classes declared in `core` folder, that under the hood inhereit from Pytorch classes with same name. The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `user_data_labels` and `num_samples` (user names, user features, user labels if the problem is supervised, and number of samples for each user, respectively). These attributes are required to have these exact names. Otherwise, it should also be able to access the examples of a specific user, which id is passed during initialization via the `user_idx` argument. 
The dataloader is simpler, and essentially just instantiates the dataset and creates batches with a specific format. ## Creating a config file All the parameters of the experiment are passed in a YAML file. A documented example is provided in `config.yaml`. ## Running the experiment Finally, to launch the experiment, it suffices to launch the `e2e_trainer.py` script using torch.distributed. ``` python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ./ -outputPath scratch -config experiments/classif_cnn/config.yaml -task cv -backend gloo ``` The `dataPath`, `outputPath` and `config` arguments should just specify the respective files or folders, as in the example above -- in this case, `dataPath` can be any path given that data is being downloaded on-the.fly. A folder called `scratch` will be created containing logs and checkpoints. The task should be the name of the folder insider `experiments`. ================================================ FILE: experiments/cv/config.yaml ================================================ model_config: model_type: resnet50 #vgg11 # class w/ `loss` and `inference` methods model_folder: experiments/cv/model.py # file containing class num_classes: 10 dp_config: enable_local_dp: false # whether to enable user-level DP privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics strategy: DGA # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics save_to_disk: false # save the updated dataset in disk optimizer_config: # this is the optimizer used to update the model type: adam lr: 0.001 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.00 step_size: 100 val_freq: 1000 # how many iterations between metric eval on val set rec_freq: 5 # how many iterations between metric eval on test set initial_val: False initial_rec: True max_iteration: 1000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration total_num_clients: 100 data_config: # where to get val and test data from val: batch_size: 128 val_data: null test: batch_size: 128 test_data: null type: personalization # Options: personalization | model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 1.0 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false convex_model_interp: 0.75 # This is specific to personalization server/client data_config: # where to get training data from train: batch_size: 128 list_of_train_data: null desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv/data.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import logging import h5py import json import os import torchvision from torchvision import transforms import numpy as np from numpy.random import RandomState from utils import print_rank class DataPartitioner(object): """ Partitions a dataset into different chunks. """ def __init__(self, data, sizes=None, rnd=0, alpha=0, num_c=10, dataset=None, lab_distr=None, ratio=1, img_size=32, wantTrans=True): self.data = data self.dataset = dataset self.total_num= len(sizes) if sizes is not None else len(lab_distr) self.img_size= img_size self.wantTrans= wantTrans if lab_distr is not None: self.partitions, self.dat_stat = self.__use_fixed_lab_distr__(data, lab_distr, ratio, rnd, num_c) else: self.partitions, self.ratio, self.dat_stat, self.endat_size = self.__getDirichletData__(data, sizes, alpha, num_c, rnd) def get_lab_distr(self): return self.dat_stat def return_partition(self, partition, flag='data', is_train_set=True): if flag != 'data': return [self.data[idx][1] for idx in self.partitions[partition]] mean = [x / 255 for x in [125.3, 123.0, 113.9]] std = [x / 255 for x in [63.0, 62.1, 66.7]] if self.wantTrans: dc = {'resize': 0.5 if is_train_set else None, 'pad': None, 'crop': None, 'flip': False, 'rotate': (-180+2*int(partition*180/self.total_num), -180+2*int((partition+1)*180/self.total_num)) if is_train_set else \ (-180+2*int(partition*180/self.total_num)+2, -180+2*int(partition*180/self.total_num)+2), 'normalize': [mean, std]} else: dc = {'resize': None, 'pad': None, 'crop': None, 'flip': False, 'rotate': None, 'normalize': [mean, std]} transform = get_transform(transform=dc,img_size=self.img_size) return {'x': [transform(self.data[idx][0]).tolist() for idx in self.partitions[partition]]} def __use_fixed_lab_distr__(self, data, lab_distr, ratio, rnd, num_c): n_nets = [] idx_batch = [] labelList = np.array(data.targets) rann = RandomState(rnd) # Find where all labels are label_dict={lab: np.where(labelList == lab)[0] for lab in range(num_c)} # Process the prefixed 
label distributions one by one for lab_indices in list(lab_distr.keys())[:-1]: net_dataidx_map = {} for lab, num in lab_distr[lab_indices].items(): len_k = len(label_dict[lab]) idx_k = label_dict[lab][:min(int(num*ratio), len_k)] label_dict[lab] = label_dict[lab][min(int(num*ratio), len_k):] if len(idx_k)>0: net_dataidx_map[lab] = list(idx_k) n_nets.append(net_dataidx_map) net_dataidx_map = {} for lab, idx_k in label_dict.items(): if len(idx_k)>0: net_dataidx_map[lab] = idx_k n_nets.append(net_dataidx_map) for i, lab_indices in enumerate(n_nets): idx_batch.append([item for sublist in lab_indices.values() for item in sublist]) net_cls_counts = {} for net_i, dataidx in enumerate(idx_batch): unq, unq_cnt = np.unique(labelList[dataidx], return_counts=True) tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} net_cls_counts[net_i] = tmp print_rank('Data statistics: %s' % str(net_cls_counts), loglevel=logging.DEBUG) if 0: count=0 tot_count={i:0 for i in range(10)} for _, client in net_cls_counts.items(): for lab, num in client.items(): tot_count[lab]+=num count+=num print('Debugging:', tot_count, count) return idx_batch, net_cls_counts # Getting this function from FedML -- 02-17-22 def __getDirichletData__(self, data, psizes, alpha, num_c, rnd): n_nets = len(psizes) K = num_c labelList = np.array(data.targets) min_size = 0 N = len(labelList) rann = RandomState(rnd) net_dataidx_map = {} while min_size < K: idx_batch = [[] for _ in range(n_nets)] # for each class in the dataset for k in range(K): idx_k = np.where(labelList == k)[0] rann.shuffle(idx_k) proportions = rann.dirichlet(np.repeat(alpha, n_nets)) ## Balance proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)]) proportions = proportions / proportions.sum() proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))] min_size = min([len(idx_j) for idx_j in idx_batch]) 
for j in range(n_nets): rann.shuffle(idx_batch[j]) net_dataidx_map[j] = idx_batch[j] net_cls_counts = {} for net_i, dataidx in net_dataidx_map.items(): unq, unq_cnt = np.unique(labelList[dataidx], return_counts=True) tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} net_cls_counts[net_i] = tmp local_sizes = [] for i in range(n_nets): local_sizes.append(len(net_dataidx_map[i])) local_sizes = np.array(local_sizes) weights = local_sizes / np.sum(local_sizes) print_rank('Data statistics: %s' % str(net_cls_counts), loglevel=logging.DEBUG) print_rank('Data ratio: %s' % str(weights), loglevel=logging.DEBUG) if 0: count=0 tot_count={i:0 for i in range(10)} for _, client in net_cls_counts.items(): for lab, num in client.items(): tot_count[lab]+=num count+=num print('Debugging:', tot_count, count) return idx_batch, weights, net_cls_counts, np.sum(local_sizes) def partition_dataset(rnd, img_size, image, total_num_clients, image_path, alpha, wantTransform): partition_sizes = [1.0/total_num_clients for _ in range(total_num_clients)] if image == 'cifar': trainset = torchvision.datasets.CIFAR10( root=os.path.join(image_path, image), train=True, download=True, transform=None) train_partition = DataPartitioner(trainset, partition_sizes, rnd, alpha=alpha, num_c=10, img_size=img_size, wantTrans=wantTransform) testset = torchvision.datasets.CIFAR10( root=os.path.join(image_path, image), train=False, download=True, transform=None) if 0: lab_distr= train_partition.get_lab_distr() test_partition = DataPartitioner(testset, lab_distr=lab_distr, rnd=rnd, ratio=0.2, num_c=10, img_size=img_size, wantTrans=wantTransform) else: test_partition = DataPartitioner(testset, partition_sizes, rnd, alpha=alpha, num_c=10, img_size=img_size, wantTrans=wantTransform) elif image == 'cifar100': trainset = torchvision.datasets.CIFAR100( root=os.path.join(image_path, image), train=True, download=True, transform=transform_train) # NOTE: Is this working? 
train_partition = DataPartitioner(trainset, partition_sizes, rnd, alpha=alpha, num_c=100) testset = torchvision.datasets.CIFAR100( root=os.path.join(image_path, image), train=False, download=True, transform=transform_test) test_partition = DataPartitioner(testset, partition_sizes, rnd, alpha=alpha, num_c=100) return train_partition, test_partition # Setup all necessary image datasets for training def prepare_dataset(rnd=2020, img_size=40, image='cifar', total_num_clients=100, image_path="./", alpha= 1.0, wantTransform=False, save_to_disk=False): train_partition, test_partition = partition_dataset(rnd=rnd, img_size=img_size, image=image, total_num_clients=total_num_clients, image_path=image_path, alpha=alpha, wantTransform= wantTransform) datasets = ["train_dataset.hdf5", "test_dataset.hdf5"] print_rank('Processing {}... '.format(datasets), loglevel=logging.DEBUG) output = [_process_and_save_to_disk(train_partition if set == "train_dataset.hdf5" else test_partition, save_to_disk, file_format= set.split('.')[-1], output=set, is_train_set=True if set == "train_dataset.hdf5" else False) for set in datasets] return output[0], output[1] def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in data_dict['user_data']['x'].items(): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in data_dict['user_data_label'].items(): user_data_label_group.create_dataset(user, data=user_data_label) def _process_and_save_to_disk(dataset, save_to_disk, file_format, output, is_train_set=True): '''Process a Torchvision dataset to expected format and save to 
disk''' n_users = len(dataset.partitions) # Convert training data to expected format print_rank('Converting data to expected format...', loglevel=logging.DEBUG) data_dict = { 'users': [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples': [len(dataset.partitions[user_id]) for user_id in range(n_users)], 'user_data': {f'{user_id:04d}': dataset.return_partition(user_id, 'data', is_train_set) for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.return_partition(user_id, 'labels', is_train_set) for user_id in range(n_users)}, } # Save training data to disk print_rank('Saving data to disk...', loglevel=logging.DEBUG) if save_to_disk: if file_format == 'json': outfile =output + '.json' with open(outfile, 'w') as json_file: json.dump(data_dict, json_file) elif file_format == 'hdf5': outfile =output + '.hdf5' with h5py.File(outfile, 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) else: raise ValueError('unknown format.') print_rank('Finished saving data...{}'.format(outfile), loglevel=logging.DEBUG) else: outfile=data_dict return outfile def get_transform(transform, img_size=32): """Unpack transformations and apply to train or test splits""" transform_list = [transforms.ToTensor()] # resize if transform['resize'] is not None: transform_list.append(transforms.RandomResizedCrop(img_size, scale=(transform['resize'], 2*transform['resize']))) transform_list.append(torchvision.transforms.Pad(4)) else: transform_list.append(transforms.RandomCrop(img_size, padding=4)) #transform_list.append(transforms.Resize(img_size)) # padding if transform['pad'] is not None: transform_list.append(transforms.Pad(transform['pad'])) # crop if transform['crop'] is not None: transform_list.append(transforms.RandomResizedCrop(transform['crop'])) if transform['rotate'] is not None: transform_list.append(transforms.RandomRotation(transform['rotate'])) # flips if transform['flip']: transform_list.append(transforms.RandomHorizontalFlip()) 
transform_list.append(transforms.RandomVerticalFlip()) # normalization if transform['normalize'] is not None: transform_list.append(transforms.Normalize(mean=transform['normalize'][0], std=transform['normalize'][1])) return transforms.Compose(transform_list) ================================================ FILE: experiments/cv/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', 0), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) return {'x': torch.tensor(np.array(x)), 'y': torch.tensor(np.array(y)).long()} ================================================ FILE: experiments/cv/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.cv.data import prepare_dataset class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data['x'] for user_data in self.user_data.values()]) self.labels = np.hstack(list(self.user_data_label.values())) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = np.vstack([user_data['x'] for user_data in self.user_data.values()]) self.labels = np.hstack(list(self.user_data_label.values())) def __getitem__(self, idx): return self.features[idx].astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Download or load data from disk/memory. The `data` argument can be either the path to the JSON or HDF5 file that contains the expected dictionary, or the actual dictionary. In case data cannot be loaded, will be downloaded through `prepare_dataset` method.''' if data == None: training_dataset, test_dataset = prepare_dataset(rnd=2020, img_size=40, image='cifar', total_num_clients=100, image_path="./", save_to_disk= False, alpha= 1.0, wantTransform= False) data = test_dataset if test_only else training_dataset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/cv/model.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
''' Modified from https://github.com/pytorch/vision.git The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. ''' import torch as T import torch.nn as nn import numpy as np import logging logging.basicConfig(format='%(levelname)s - %(message)s', level=logging.DEBUG) from torch import Tensor from torch.utils.model_zoo import load_url as load_state_dict_from_url from typing import Type, Any, Callable, Union, List, Optional from core.model import BaseModel __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-f37072fd.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-b627a593.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-0676ba61.pth', 'resnet101': 'https://download.pytorch.org/models/resnet101-63fe2227.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-394f9c45.pth', 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', } def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation) def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion: int = 1 def __init__( self, inplanes: int, planes: int, stride: 
int = 1, downsample: Optional[nn.Module] = None, groups: int = 1, base_width: int = 64, dilation: int = 1, norm_layer: Optional[Callable[..., nn.Module]] = None ) -> None: super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d if groups != 1 or base_width != 64: raise ValueError('BasicBlock only supports groups=1 and base_width=64') if dilation > 1: raise NotImplementedError("Dilation > 1 not supported in BasicBlock") # Both self.conv1 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x: Tensor) -> Tensor: identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Module): # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) # while original implementation places the stride at the first 1x1 convolution(self.conv1) # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. # This variant is also known as ResNet V1.5 and improves accuracy according to # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
expansion: int = 4 def __init__( self, inplanes: int, planes: int, stride: int = 1, downsample: Optional[nn.Module] = None, groups: int = 1, base_width: int = 64, dilation: int = 1, norm_layer: Optional[Callable[..., nn.Module]] = None ) -> None: super(Bottleneck, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d width = int(planes * (base_width / 64.)) * groups # Both self.conv2 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv1x1(inplanes, width) self.bn1 = norm_layer(width) self.conv2 = conv3x3(width, width, stride, groups, dilation) self.bn2 = norm_layer(width) self.conv3 = conv1x1(width, planes * self.expansion) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x: Tensor) -> Tensor: identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResNet(BaseModel): def __init__( self, block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], num_class: int = 1000, zero_init_residual: bool = False, groups: int = 1, width_per_group: int = 64, replace_stride_with_dilation: Optional[List[bool]] = None, norm_layer: Optional[Callable[..., nn.Module]] = None ) -> None: super(ResNet, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d self._norm_layer = norm_layer self.inplanes = 64 self.dilation = 1 if replace_stride_with_dilation is None: # each element in the tuple indicates if we should replace # the 2x2 stride with a dilated convolution instead replace_stride_with_dilation = [False, False, False] if len(replace_stride_with_dilation) != 3: raise ValueError("replace_stride_with_dilation should be None " "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 
self.groups = groups self.base_width = width_per_group self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = norm_layer(self.inplanes) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(512 * block.expansion, num_class) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) # Zero-initialize the last BN in each residual branch, # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] elif isinstance(m, BasicBlock): nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, stride: int = 1, dilate: bool = False) -> nn.Sequential: norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( conv1x1(self.inplanes, planes * block.expansion, stride), norm_layer(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) return nn.Sequential(*layers) def forward(self, inputs): inp = inputs['x'].cuda() if T.cuda.is_available() else inputs['x'] x = self.conv1(T.transpose(inp, 1, 3)) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) x = self.layer4(x) x = self.avgpool(x) x = T.flatten(x, 1) x = self.fc(x) return x def get_logit(self, x = None, evalis = True, logmax=False): data, target = x if logmax: Softmax = T.nn.LogSoftmax(dim=1) else: Softmax = T.nn.Softmax(dim=1) data = data.cuda() if T.cuda.is_available() else data if evalis: self.eval() with T.no_grad(): # Run the forward pass output = self.forward(data) logits = Softmax(output) logits.detach_() else: self.train() output = self.forward(data) logits = Softmax(output) loss = 1 return logits.cpu(), target.cpu(), loss def inference(self, inputs): targets = 
inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass self.eval() output = self(inputs) output = T.nn.LogSoftmax(dim=1)(output) # accuracy accuracy = T.mean((T.argmax(output, dim=1) == targets).float()).item() output = {'probabilities': output.cpu().detach().numpy(), 'predictions': np.arange(0, targets.shape[0]), 'labels': targets.cpu().numpy()} return {'output':output, 'acc': accuracy, 'batch_size': targets.shape[0]} def loss(self, inputs): targets = inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass self.train() output = self.forward(inputs) loss = T.nn.functional.cross_entropy(output, targets) return loss def copy_state_dict(self, state_dict): self.state_dict=state_dict.clone() def get_model(self): return self def _resnet( arch: str, block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], pretrained: bool, progress: bool, **kwargs: Any ) -> ResNet: model = ResNet(block, layers, **kwargs) if pretrained: state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) # edit last layer state_dict['fc.weight'] = state_dict['fc.weight'][:kwargs['num_class']] state_dict['fc.bias'] = state_dict['fc.bias'][:kwargs['num_class']] model.load_state_dict(state_dict) return model def resnet18(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-18 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs) def resnet34(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-34 model from `"Deep Residual Learning for Image Recognition" `_. 
Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs) def resnet50(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-50 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) def resnet101(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-101 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) def resnet152(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-152 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, **kwargs) def resnext50_32x4d(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNeXt-50 32x4d model from `"Aggregated Residual Transformation for Deep Neural Networks" `_. 
Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['groups'] = 32 kwargs['width_per_group'] = 4 kwargs['num_class']= config['num_classes'] return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) def resnext101_32x8d(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNeXt-101 32x8d model from `"Aggregated Residual Transformation for Deep Neural Networks" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['groups'] = 32 kwargs['width_per_group'] = 8 kwargs['num_class']= config['num_classes'] return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) def wide_resnet50_2(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""Wide ResNet-50-2 model from `"Wide Residual Networks" `_. The model is the same as ResNet except for the bottleneck number of channels which is twice larger in every block. The number of channels in outer 1x1 convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 channels, and in Wide ResNet-50-2 has 2048-1024-2048. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['width_per_group'] = 64 * 2 kwargs['num_class']= config['num_classes'] return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) def wide_resnet101_2(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""Wide ResNet-101-2 model from `"Wide Residual Networks" `_. The model is the same as ResNet except for the bottleneck number of channels which is twice larger in every block. 
The number of channels in outer 1x1 convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 channels, and in Wide ResNet-50-2 has 2048-1024-2048. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['width_per_group'] = 64 * 2 kwargs['num_class']= config['num_classes'] return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) ================================================ FILE: experiments/cv/model_vgg.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' Modified from https://github.com/pytorch/vision.git The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. ''' import math import torch as T import torch.nn as nn import numpy as np import logging logging.basicConfig(format='%(levelname)s - %(message)s', level=logging.DEBUG) __all__ = [ 'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', ] class VGG(nn.Module): ''' VGG model ''' def __init__(self, vgg, num_class, topK_results=None): super(VGG, self).__init__() self.topK_results = num_class if topK_results is None else topK_results self.vgg = vgg self.classifier = nn.Sequential( nn.Dropout(), nn.Linear(512, 512), nn.ReLU(True), nn.Dropout(), nn.Linear(512, 512), nn.ReLU(True), nn.Linear(512, num_class), ) if 0: # Initialize weights for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. 
/ n)) m.bias.data.zero_() def forward(self, inputs): inputs = inputs['x'].cuda() if T.cuda.is_available() else inputs['x'] x = self.vgg(inputs.view(-1,3,32,32)) x = T.flatten(x, 1) x = self.classifier(x) return x def loss(self, inputs): targets = inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass output = self(inputs) loss = T.nn.functional.cross_entropy(output, targets) return loss def inference(self, inputs): targets = inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass output = self(inputs) # accuracy accuracy = T.mean((T.argmax(output, dim=1) == targets).float()).item() output = {'probabilities': output.cpu().detach().numpy(), 'predictions': np.arange(0, targets.shape[0]), 'labels': targets.cpu().numpy()} return {'output':output, 'val_acc': accuracy, 'batch_size': targets.shape[0]} def get_logit(self, inputs = None, evalis = True, logmax=False): data, targets = inputs if logmax: Softmax = T.nn.LogSoftmax(dim=1) else: Softmax = T.nn.Softmax(dim=1) data = data.cuda() if T.cuda.is_available() else data if evalis: self.eval() with T.no_grad(): # Run the forward pass output = self.forward(data) logits = Softmax(output) else: self.train() output = self.forward(data) logits = Softmax(output) loss = T.nn.functional.cross_entropy(output, targets) return logits.cpu(), targets.cpu(), loss.cpu() def copy_state_dict(self, state_dict): self.state_dict=state_dict.clone() def set_eval(self): """ Bring the model into evaluation mode """ self.eval() def set_train(self): """ Bring the model into train mode """ self.train() def make_layers(cfg, n_channels=3, batch_norm=True): layers = [] in_channels = n_channels for v in cfg: if v == 'M': layers += [nn.MaxPool2d(kernel_size=2, stride=2)] else: conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) if batch_norm: layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] else: layers += [conv2d, nn.ReLU(inplace=True)] in_channels = v return 
nn.Sequential(*layers) cfg = { 'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], } def vgg11(config): """VGG 11-layer model (configuration "A")""" num_class = config['num_classes'] return VGG(make_layers(cfg['A'], batch_norm=False),num_class) def vgg11_bn(config): """VGG 11-layer model (configuration "A") with batch normalization""" num_class = config['num_classes'] return VGG(make_layers(cfg['A'], batch_norm=True),num_class) def vgg13(config): """VGG 13-layer model (configuration "B")""" num_class = config['num_classes'] return VGG(make_layers(cfg['B'], batch_norm=False),num_class) def vgg13_bn(config): """VGG 13-layer model (configuration "B") with batch normalization""" num_class=config['num_classes'] return VGG(make_layers(cfg['B'], batch_norm=True),num_class) def vgg16(config): """VGG 16-layer model (configuration "D")""" num_class = config['num_classes'] return VGG(make_layers(cfg['D'], batch_norm=False),num_class) def vgg16_bn(config): """VGG 16-layer model (configuration "D") with batch normalization""" num_class = config['num_classes'] return VGG(make_layers(cfg['D'], batch_norm=True),num_class) def vgg19(config): """VGG 19-layer model (configuration "E")""" num_class=config['num_classes'] return VGG(make_layers(cfg['E'], batch_norm=False),num_class) def vgg19_bn(config): """VGG 19-layer model (configuration 'E') with batch normalization""" num_class=config['num_classes'] return VGG(make_layers(cfg['E'], batch_norm=True),num_class) ================================================ FILE: experiments/cv/server.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
'''
In this file, we define the server used by this experiment: a thin subclass
of core.server.OptimizationServer where customized server-side routines can
be added.
'''

from core.server import OptimizationServer

class PersonalizationServer(OptimizationServer):

    def __init__(self, num_clients, model, optimizer, ss_scheduler, data_path, model_path,
                server_train_dataloader, config, idx_val_clients, idx_test_clients):
        """
        Personalization Server.

        Currently identical to OptimizationServer; override methods here to
        add experiment-specific server-side behavior.

        Customized routines for server can be included here.
        """

        # No extra behavior yet -- delegate everything to the base server.
        super().__init__(num_clients, model, optimizer, ss_scheduler, data_path, model_path,
                server_train_dataloader, config, idx_val_clients, idx_test_clients)



================================================
FILE: experiments/cv_cnn_femnist/README.md
================================================
## FedML Benchmark

### Examples

The example in this folder was taken from [FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version).

### Data

FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository.

### Run

If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows:

```code
python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/cv_cnn_femnist/config.yaml -task cv_cnn_femnist -backend nccl
```

### Results

This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049).
``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. ```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "femnist" data_cache_dir: "~/fedml_data" partition_method: "hetero" partition_alpha: 0.5 model_args: model: "cnn" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 3400 client_num_per_round: 10 comm_round: 800 epochs: 1 batch_size: 20 client_optimizer: sgd learning_rate: 0.1 weight_decay: 0.001 validation_args: frequency_of_the_test: 50 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedemnist_cnn/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ================================================ FILE: experiments/cv_cnn_femnist/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. 
# Parameters needed to initialize the model model_config: model_type: CNN # class w/ `loss` and `inference` methods model_folder: experiments/cv_cnn_femnist/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50000 # not executing validation on this experiment, only testing rec_freq: 50 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 800 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 20 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 20 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.1 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 20 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.1 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv_cnn_femnist/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv_cnn_femnist.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/cv_cnn_femnist/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.cv_cnn_femnist.dataloaders.preprocess import FEMNIST class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if user_idx == -1: self.user = self.user_list self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = FEMNIST() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/cv_cnn_femnist/dataloaders/preprocess.py ================================================ import os import h5py import wget import tarfile data_cache_dir = "./data" DEFAULT_TRAIN_FILE = "fed_emnist_train.h5" DEFAULT_TEST_FILE = "fed_emnist_test.h5" ''' The FederatedEMNIST dataset is taken from FedML repository. 
For more information regarding this dataset, please refer to https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/FederatedEMNIST. In order to download the data run the following commands: - wget --no-check-certificate --no-proxy https://fedml.s3-us-west-1.amazonaws.com/fed_emnist.tar.bz2 - tar -xvf fed_emnist.tar.bz2 ''' class FEMNIST: def __init__(self) : download_files(data_cache_dir) # Preprocess the dataset train_h5 = h5py.File(os.path.join(data_cache_dir,'femnist', DEFAULT_TRAIN_FILE), "r") test_h5 = h5py.File(os.path.join(data_cache_dir, 'femnist',DEFAULT_TEST_FILE), "r") test_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} train_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} for user in test_h5['examples'].keys(): test_dict['users'].append(user) test_dict['num_samples'].append(len(test_h5['examples'][user]['pixels'][()])) test_dict['user_data'][user] = test_h5['examples'][user]['pixels'][()] test_dict['user_data_label'][user] = test_h5['examples'][user]['label'][()] for user in train_h5['examples'].keys(): train_dict['users'].append(user) train_dict['num_samples'].append(len(train_h5['examples'][user]['pixels'][()])) train_dict['user_data'][user] = train_h5['examples'][user]['pixels'][()] train_dict['user_data_label'][user] = train_h5['examples'][user]['label'][()] print(" Dictionaries ready .. 
") self.trainset, self.testset = train_dict, test_dict def download_files(data_cache_dir): URL = "https://fedml.s3-us-west-1.amazonaws.com/fed_emnist.tar.bz2" if not os.path.exists(data_cache_dir): os.makedirs(data_cache_dir) file_path = os.path.join(data_cache_dir,"fed_emnist.tar.bz2") # Download and decompress the file (if we haven't already) if not os.path.exists(file_path): wget.download(URL, out=file_path) file = tarfile.open(file_path) file.extractall(os.path.join(data_cache_dir,'femnist')) file.close() ================================================ FILE: experiments/cv_cnn_femnist/model.py ================================================ import torch from torch import nn from torch.nn import functional as F from core.model import BaseModel ''' The CNN_DropOut model is taken from FedML repository. For more information regarding this model, please refer to https://github.com/FedML-AI/FedML/blob/master/python/fedml/model/cv/cnn.py. ''' class CNN_DropOut(torch.nn.Module): """ Recommended model by "Adaptive Federated Optimization" (https://arxiv.org/pdf/2003.00295.pdf) Used for EMNIST experiments. 
When `only_digits=True`, the summary of returned model is ``` Model: _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= reshape (Reshape) (None, 28, 28, 1) 0 _________________________________________________________________ conv2d (Conv2D) (None, 26, 26, 32) 320 _________________________________________________________________ conv2d_1 (Conv2D) (None, 24, 24, 64) 18496 _________________________________________________________________ max_pooling2d (MaxPooling2D) (None, 12, 12, 64) 0 _________________________________________________________________ dropout (Dropout) (None, 12, 12, 64) 0 _________________________________________________________________ flatten (Flatten) (None, 9216) 0 _________________________________________________________________ dense (Dense) (None, 128) 1179776 _________________________________________________________________ dropout_1 (Dropout) (None, 128) 0 _________________________________________________________________ dense_1 (Dense) (None, 10) 1290 ================================================================= Total params: 1,199,882 Trainable params: 1,199,882 Non-trainable params: 0 ``` Args: only_digits: If True, uses a final layer with 10 outputs, for use with the digits only MNIST dataset (http://yann.lecun.com/exdb/mnist/). If False, uses 62 outputs for Federated Extended MNIST (FEMNIST) EMNIST: Extending MNIST to handwritten letters: https://arxiv.org/abs/1702.05373. Returns: A `torch.nn.Module`. 
""" def __init__(self, only_digits=True): super(CNN_DropOut, self).__init__() self.conv2d_1 = torch.nn.Conv2d(1, 32, kernel_size=3) self.max_pooling = nn.MaxPool2d(2, stride=2) self.conv2d_2 = torch.nn.Conv2d(32, 64, kernel_size=3) self.dropout_1 = nn.Dropout(0.25) self.flatten = nn.Flatten() self.linear_1 = nn.Linear(9216, 128) self.dropout_2 = nn.Dropout(0.5) self.linear_2 = nn.Linear(128, 10 if only_digits else 62) self.relu = nn.ReLU() # self.softmax = nn.Softmax(dim=1) def forward(self, x): x = torch.unsqueeze(x, 1) x = self.conv2d_1(x) x = self.relu(x) x = self.conv2d_2(x) x = self.relu(x) x = self.max_pooling(x) x = self.dropout_1(x) x = self.flatten(x) x = self.linear_1(x) x = self.relu(x) x = self.dropout_2(x) x = self.linear_2(x) # x = self.softmax(self.linear_2(x)) return x class CNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = CNN_DropOut(False) def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) criterion = nn.CrossEntropyLoss().to(device) return criterion(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() return {'output':output, 'acc': accuracy, 'batch_size': n_samples} ================================================ FILE: experiments/cv_lr_mnist/README.md ================================================ ## FedML Benchmark ### Examples The example in this folder was taken from 
[FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version). ### Data FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository. ### Run If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/cv_lr_mnist/config.yaml -task cv_lr_mnist -backend nccl ``` ### FedML Results This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). ``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. 
```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "mnist" data_cache_dir: ~/fedml_data partition_method: "hetero" partition_alpha: 0.5 model_args: model: "lr" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 1000 client_num_per_round: 10 comm_round: 100 epochs: 1 batch_size: 10 client_optimizer: sgd learning_rate: 0.03 weight_decay: 0.001 validation_args: frequency_of_the_test: 20 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedemnist_cnn/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ### Flower Results This comparison was carried out using Flower (Simulator) on version 1.0.0 at commit ID [4e7fad9](https://github.com/adap/flower/tree/4e7fad99389a5ee511730841b61f279e3359cb16). Showing that in some cases FLUTE can outperform 53x faster. ``` ________________________________________________ | | Flower (Ray) | FLUTE (NCCL/Gloo) | | | Acc | Time | Acc | Time | |--------|-----|-------------|-----|-------------| | CPU | ~80 | 00:30:14 | ~80 | 00:03:20 | | GPU 2x | ~80 | 01:21:44 | ~80 | 00:01:31 | | GPU 4x | ~79 | 00:56:45 | ~81 | 00:01:26 | ------------------------------------------------ ``` ### Flower Configuration file In order to reproduce this experiment in Flower please use the following patch [file](https://github.com/AnonymousQTHM31/FLUTE/blob/main/flower.patch) for the CPU setup. If you want to use multiple GPUs, follow the configuration suggested [here](https://github.com/adap/flower/issues/1415) ================================================ FILE: experiments/cv_lr_mnist/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. 
# Parameters needed to initialize the model model_config: model_type: LR # class w/ `loss` and `inference` methods model_folder: experiments/cv_lr_mnist/model.py # file containing class input_dim: 784 output_dim: 10 # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1000 # how many iterations between metric eval on val set rec_freq: 20 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 100 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 10 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.03 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 10 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.03 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv_lr_mnist/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv_lr_mnist.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/cv_lr_mnist/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.cv_lr_mnist.dataloaders.preprocessing import MNIST class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if user_idx == -1: self.user = self.user_list self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = MNIST() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/cv_lr_mnist/dataloaders/preprocessing.py ================================================ import os import wget import zipfile import numpy as np import json FEDML_DATA_MNIST_URL = "https://fedcv.s3.us-west-1.amazonaws.com/MNIST.zip" data_cache_dir = "./data" ''' The MNIST dataset is taken from FedML repository. 
For more information regarding this dataset, please refer to https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/MNIST. In order to download the data run the following commands: - wget --no-check-certificate --no-proxy https://fedcv.s3.us-west-1.amazonaws.com/MNIST.zip - unzip MNIST.zip ''' class MNIST: def __init__(self) : download_mnist(data_cache_dir) self.trainset, self.testset = read_data( train_data_dir = os.path.join(data_cache_dir,'MNIST','train'), test_data_dir= os.path.join(data_cache_dir,'MNIST','test'), ) print("Dictionaries ready ..") def download_mnist(data_cache_dir): if not os.path.exists(data_cache_dir): os.makedirs(data_cache_dir) file_path = os.path.join(data_cache_dir,"MNIST.zip") # Download the file (if we haven't already) if not os.path.exists(file_path): wget.download(FEDML_DATA_MNIST_URL, out=file_path) with zipfile.ZipFile(file_path, "r") as zip_ref: zip_ref.extractall(data_cache_dir) def read_data(train_data_dir, test_data_dir): train_files = os.listdir(train_data_dir) train_files = [f for f in train_files if f.endswith(".json")] for f in train_files: file_path = os.path.join(train_data_dir, f) with open(file_path, "r") as inf: train_data = json.load(inf) train_data['user_data_label'] = dict() for user in train_data['user_data']: train_data['user_data_label'][user] = train_data['user_data'][user]['y'] train_data['user_data'][user] = train_data['user_data'][user]['x'] test_files = os.listdir(test_data_dir) test_files = [f for f in test_files if f.endswith(".json")] for f in test_files: file_path = os.path.join(test_data_dir, f) with open(file_path, "r") as inf: test_data = json.load(inf) test_data['user_data_label'] = dict() for user in test_data['user_data']: test_data['user_data_label'][user] = test_data['user_data'][user]['y'] test_data['user_data'][user] = test_data['user_data'][user]['x'] return train_data, test_data ================================================ FILE: experiments/cv_lr_mnist/model.py 
================================================ import torch from torch import nn from torch.nn import functional as F from core.model import BaseModel ''' The LogisticRegression model is taken from FedML repository. For more information regarding this model, please refer to https://github.com/FedML-AI/FedML/blob/master/python/fedml/model/linear/lr.py. ''' class LogisticRegression(torch.nn.Module): def __init__(self, input_dim, output_dim): super(LogisticRegression, self).__init__() self.linear = torch.nn.Linear(input_dim, output_dim) def forward(self, x): o = self.linear(x.view(-1,28*28)) outputs = torch.sigmoid(o) #outputs = torch.sigmoid(self.linear(x)) return outputs class LR(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = LogisticRegression(model_config['input_dim'], model_config['output_dim']) def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) criterion = nn.CrossEntropyLoss().to(device) return criterion(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() return {'output':output, 'acc': accuracy, 'batch_size': n_samples} ================================================ FILE: experiments/cv_resnet_fedcifar100/README.md ================================================ ## FedML Benchmark ### Examples The example in this folder was taken from [FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository 
on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version). ### Data FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository. ### Run If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/cv_resnet_fedcifar100/config.yaml -task cv_resnet_fedcifar100 -backend nccl ``` ### Results This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). ``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. 
```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "fed_cifar100" data_cache_dir: ~/fedml_data partition_method: "hetero" partition_alpha: 0.5 model_args: model: "resnet18_gn" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 500 client_num_per_round: 10 comm_round: 4000 epochs: 1 batch_size: 20 client_optimizer: sgd learning_rate: 0.1 weight_decay: 0.001 validation_args: frequency_of_the_test: 50 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedcifar100_resnet18/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ================================================ FILE: experiments/cv_resnet_fedcifar100/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. # Parameters needed to initialize the model model_config: model_type: RESNET # class w/ `loss` and `inference` methods model_folder: experiments/cv_resnet_fedcifar100/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50000 # how many iterations between metric eval on val set rec_freq: 50 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 4000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 20 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 20 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.1 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 20 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.1 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv_resnet_fedcifar100/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv_resnet_fedcifar100.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/cv_resnet_fedcifar100/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class Dataset(BaseDataset):
    """FedCIFAR100 dataset wrapper.

    In test mode (or when ``user_idx == -1``) the data of every user is
    concatenated into a single array; in train mode the dataset holds the
    data of the single user selected by ``user_idx``.
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx

        # Get all data
        self.user_list, self.user_data, self.user_data_label, self.num_samples = \
            self.load_data(data, self.test_only)

        # The original code had two byte-identical aggregation branches for
        # `user_idx == -1` and `test_only`; they are merged here.
        if user_idx == -1 or self.test_only:
            # combine all users' data into single arrays
            self.user = 'test_only'
            self.features = np.vstack(list(self.user_data.values()))
            self.labels = np.hstack(list(self.user_data_label.values()))
        else:
            # get a single user's data
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            self.user = self.user_list[user_idx]
            self.features = self.user_data[self.user]
            self.labels = self.user_data_label[self.user]

    def __getitem__(self, idx):
        # .T reverses the axis order so channels come first
        # (presumably HWC image input — TODO confirm against preprocessing)
        return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx]

    def __len__(self):
        return len(self.features)

    def load_data(self, data, test_only):
        '''Wrapper method to read/instantiate the dataset.

        If ``data`` is None, the FEDCIFAR100 dataset is downloaded and the
        matching split (test or train) is used instead.
        '''
        if data is None:  # was `data == None`; identity check is the correct idiom
            dataset = FEDCIFAR100()
            data = dataset.testset if test_only else dataset.trainset

        users = data['users']
        features = data['user_data']
        labels = data['user_data_label']
        num_samples = data['num_samples']

        return users, features, labels, num_samples
'''
The FedCIFAR100 dataset is taken from FedML repository. For more information
regarding this dataset, please refer to
https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/fed_cifar100.

In order to download the data run the following commands:
 - wget --no-check-certificate --no-proxy https://fedml.s3-us-west-1.amazonaws.com/fed_cifar100.tar.bz2
 - tar -xvf fed_cifar100.tar.bz2
'''


def _split_to_dict(h5_file):
    '''Convert one HDF5 split into the {users, num_samples, user_data, user_data_label} layout.'''
    split = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()}
    for user in h5_file['examples'].keys():
        images = h5_file['examples'][user]['image'][()]
        labels = h5_file['examples'][user]['label'][()]
        split['users'].append(user)
        split['num_samples'].append(len(images))
        split['user_data'][user] = images
        split['user_data_label'][user] = labels
    return split


class FEDCIFAR100:
    '''Downloads (if needed) the FedCIFAR100 HDF5 files and exposes them as
    ``trainset``/``testset`` dictionaries.'''

    def __init__(self):
        download_files(data_cache_dir)

        # Preprocess datasets. Use context managers so the HDF5 handles are
        # closed once the arrays are copied into memory (the original code
        # leaked both file handles).
        with h5py.File(os.path.join(data_cache_dir, 'fed_cifar100', DEFAULT_TRAIN_FILE), "r") as train_h5:
            train_dict = _split_to_dict(train_h5)
        with h5py.File(os.path.join(data_cache_dir, 'fed_cifar100', DEFAULT_TEST_FILE), "r") as test_h5:
            test_dict = _split_to_dict(test_h5)

        print(" Dictionaries ready .. ")
        self.trainset, self.testset = train_dict, test_dict


def download_files(data_cache_dir):
    '''Download and extract the FedCIFAR100 archive into ``data_cache_dir``.

    No-op if the archive has already been downloaded.
    '''
    URL = "https://fedml.s3-us-west-1.amazonaws.com/fed_cifar100.tar.bz2"

    if not os.path.exists(data_cache_dir):
        os.makedirs(data_cache_dir)

    file_path = os.path.join(data_cache_dir, "fed_cifar100.tar.bz2")

    # Download and decompress the file (if we haven't already).
    # NOTE(review): tarfile.extractall trusts the archive's member paths;
    # acceptable here because the URL is a fixed first-party source.
    if not os.path.exists(file_path):
        wget.download(URL, out=file_path)
        with tarfile.open(file_path) as archive:
            archive.extractall(os.path.join(data_cache_dir, 'fed_cifar100'))
""" if not use_input_stats and (running_mean is None or running_var is None): raise ValueError( "Expected running_mean and running_var to be not None when use_input_stats=False" ) b, c = input.size(0), input.size(1) if weight is not None: weight = weight.repeat(b) if bias is not None: bias = bias.repeat(b) def _instance_norm( input, group, running_mean=None, running_var=None, weight=None, bias=None, use_input_stats=None, momentum=None, eps=None, ): # Repeat stored stats and affine transform params if necessary if running_mean is not None: running_mean_orig = running_mean running_mean = running_mean_orig.repeat(b) if running_var is not None: running_var_orig = running_var running_var = running_var_orig.repeat(b) # norm_shape = [1, b * c / group, group] # print(norm_shape) # Apply instance norm input_reshaped = input.contiguous().view( 1, int(b * c / group), group, *input.size()[2:] ) out = F.batch_norm( input_reshaped, running_mean, running_var, weight=weight, bias=bias, training=use_input_stats, momentum=momentum, eps=eps, ) # Reshape back if running_mean is not None: running_mean_orig.copy_( running_mean.view(b, int(c / group)).mean(0, keepdim=False) ) if running_var is not None: running_var_orig.copy_( running_var.view(b, int(c / group)).mean(0, keepdim=False) ) return out.view(b, c, *input.size()[2:]) return _instance_norm( input, group, running_mean=running_mean, running_var=running_var, weight=weight, bias=bias, use_input_stats=use_input_stats, momentum=momentum, eps=eps, ) class _GroupNorm(_BatchNorm): def __init__( self, num_features, num_groups=1, eps=1e-5, momentum=0.1, affine=False, track_running_stats=False, ): self.num_groups = num_groups self.track_running_stats = track_running_stats super(_GroupNorm, self).__init__( int(num_features / num_groups), eps, momentum, affine, track_running_stats ) def _check_input_dim(self, input): return NotImplemented def forward(self, input): self._check_input_dim(input) return group_norm( input, self.num_groups, 
self.running_mean, self.running_var, self.weight, self.bias, self.training or not self.track_running_stats, self.momentum, self.eps, ) class GroupNorm2d(_GroupNorm): r"""Applies Group Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper https://arxiv.org/pdf/1803.08494.pdf `Group Normalization`_ . Args: num_features: :math:`C` from an expected input of size :math:`(N, C, H, W)` num_groups: eps: a value added to the denominator for numerical stability. Default: 1e-5 momentum: the value used for the running_mean and running_var computation. Default: 0.1 affine: a boolean value that when set to ``True``, this module has learnable affine parameters. Default: ``True`` track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics and always uses batch statistics in both training and eval modes. Default: ``False`` Shape: - Input: :math:`(N, C, H, W)` - Output: :math:`(N, C, H, W)` (same shape as input) Examples: >>> # Without Learnable Parameters >>> m = GroupNorm2d(100, 4) >>> # With Learnable Parameters >>> m = GroupNorm2d(100, 4, affine=True) >>> input = torch.randn(20, 100, 35, 45) >>> output = m(input) """ def _check_input_dim(self, input): if input.dim() != 4: raise ValueError("expected 4D input (got {}D input)".format(input.dim())) class GroupNorm3d(_GroupNorm): """ Assume the data format is (B, C, D, H, W) """ def _check_input_dim(self, input): if input.dim() != 5: raise ValueError("expected 5D input (got {}D input)".format(input.dim())) ================================================ FILE: experiments/cv_resnet_fedcifar100/model.py ================================================ import math import torch import torch.nn as nn import torch.utils.model_zoo as model_zoo from torch.nn import functional as F from experiments.cv_resnet_fedcifar100.group_normalization import 
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(
        in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
    )


def norm2d(planes, num_channels_per_group=32):
    """Build the block's normalization layer.

    Returns GroupNorm2d when a positive group size is given, otherwise a
    plain BatchNorm2d.
    """
    print("num_channels_per_group:{}".format(num_channels_per_group))
    if num_channels_per_group <= 0:
        return nn.BatchNorm2d(planes)
    return GroupNorm2d(
        planes, num_channels_per_group, affine=True, track_running_stats=False
    )


class BasicBlock(nn.Module):
    """ResNet basic block: two 3x3 convs with a residual connection."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, group_norm=0):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm2d(planes, group_norm)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm2d(planes, group_norm)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Project the identity path when the shapes differ
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    # ResNet bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand (4x), with
    # a residual connection.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, group_norm=0):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm2d(planes, group_norm)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = norm2d(planes, group_norm)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm2d(planes * 4, group_norm)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample  # optional projection for the identity path
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # Project the identity when shapes differ (stride or channel change)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    # ResNet backbone parameterized by block type and per-stage depths.
    # `group_norm` selects GroupNorm channels-per-group (0 = BatchNorm).
    def __init__(self, block, layers, num_classes=1000, group_norm=0):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm2d(64, group_norm)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], group_norm=group_norm)
        self.layer2 = self._make_layer(
            block, 128, layers[1], stride=2, group_norm=group_norm
        )
        self.layer3 = self._make_layer(
            block, 256, layers[2], stride=2, group_norm=group_norm
        )
        self.layer4 = self._make_layer(
            block, 512, layers[3], stride=2, group_norm=group_norm
        )
        # self.avgpool = nn.AvgPool2d(7, stride=1)
        # 1x1 average pool — presumably because CIFAR feature maps are
        # already small at this stage (the 7x7 ImageNet variant is kept
        # commented out above); TODO confirm intended input resolution.
        self.avgpool = nn.AvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # He-style init for convs; unit weight / zero bias for norm layers
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, GroupNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        # Zero-init the last norm layer of every residual block so each
        # block initially passes the identity through.
        for m in self.modules():
            if isinstance(m, Bottleneck):
                m.bn3.weight.data.fill_(0)
            if isinstance(m, BasicBlock):
                m.bn2.weight.data.fill_(0)

    def _make_layer(self, block, planes, blocks, stride=1, group_norm=0):
        # Build one stage: the first block may downsample; the rest keep shape.
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection to match the residual path's shape
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                norm2d(planes * block.expansion, group_norm),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, group_norm))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, group_norm=group_norm))

        return nn.Sequential(*layers)

    def forward(self, x):
        # Stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # Four residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Pool, flatten, classify
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet18"]))
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet34"]))
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet50"]))
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet101"]))
    return model
def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet152"]))
    return model


class RESNET(BaseModel):
    '''This is a PyTorch model with some extra methods'''

    def __init__(self, model_config):
        super().__init__()
        # model_config is accepted for interface compatibility but unused here
        self.net = resnet18()

    def _forward_on_device(self, input):
        '''Move the batch to the available device and run the network.

        Args:
            input (dict): batch with tensors under keys 'x' and 'y'.

        Returns:
            tuple: (output logits, labels on device, batch size).
        '''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # NOTE(review): only the batch is moved to `device`; self.net is
        # presumably placed on the device by the framework — confirm.
        features, labels = input['x'].to(device), input['y'].to(device)
        # Call the module itself rather than .forward() so registered hooks run
        output = self.net(features)
        return output, labels, features.shape[0]

    def loss(self, input: dict) -> torch.Tensor:
        '''Performs forward step and computes the loss'''
        output, labels, _ = self._forward_on_device(input)
        return F.cross_entropy(output, labels.long())

    def inference(self, input):
        '''Performs forward step and computes metrics

        Returns:
            dict: raw 'output' logits, 'acc' top-1 accuracy, 'batch_size'.
        '''
        output, labels, n_samples = self._forward_on_device(input)
        accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item()
        return {'output': output, 'acc': accuracy, 'batch_size': n_samples}
"execution_count": 23, "metadata": { "gather": { "logged": 1644397182872 } }, "outputs": [], "source": [ "import csv\n", "import time\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "import torch\n", "import torch.nn as nn\n", "import matplotlib.pyplot as plt\n", "import torch.nn.functional as F\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.optim import AdamW, SGD\n", "from torch.optim.lr_scheduler import StepLR\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "gather": { "logged": 1644332993422 } }, "outputs": [], "source": [ "class Config:\n", " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", " train_csv_path = './raw_data/mitbih_train.csv'\n", " test_csv_path = './raw_data/mitbih_test.csv'\n", " seed = 123\n", "config = Config" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, "source_hidden": false }, "nteract": { "transient": { "deleting": false } } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": { "gather": { "logged": 1644332993546 } }, "outputs": [], "source": [ "class ECGDataset(Dataset):\n", "\n", " def __init__(self, df):\n", " self.df = df\n", " self.data_columns = self.df.columns[:-2].tolist()\n", "\n", " def __getitem__(self, idx):\n", " signal = self.df.loc[idx, self.data_columns].astype('float32')\n", " signal = torch.FloatTensor([signal.values]) \n", " target = torch.LongTensor(np.array(self.df.loc[idx, 'class']))\n", " return signal, target\n", "\n", " def __len__(self):\n", " return len(self.df)\n", "\n", "id_to_label = {\n", " 0: \"Normal\",\n", " 1: \"Artial Premature\",\n", " 2: \"Premature ventricular contraction\",\n", " 3: \"Fusion of 
ventricular and normal\",\n", " 4: \"Fusion of paced and normal\"\n", "}\n", "\n", "def get_dataloader(phase: str, batch_size: int = 96) -> DataLoader:\n", " '''\n", " Dataset and DataLoader.\n", " Parameters:\n", " pahse: training or validation phase.\n", " batch_size: data per iteration.\n", " Returns:\n", " data generator\n", " '''\n", " df = pd.read_csv(config.train_csv_path, header=None)\n", " df.rename(columns={187: 'class'}, inplace=True)\n", " df['label'] = df.iloc[:, -1].map(id_to_label)\n", " train_df, val_df = train_test_split(\n", " df, test_size=0.15, random_state=config.seed, stratify=df['label']\n", " )\n", " train_df, val_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True)\n", " df = train_df if phase == 'train' else val_df\n", " dataset = ECGDataset(df)\n", " dataloader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=4)\n", " return dataloader" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "gather": { "logged": 1644332993675 } }, "outputs": [], "source": [ "class Swish(nn.Module):\n", " def forward(self, x):\n", " return x * torch.sigmoid(x)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "gather": { "logged": 1644332993801 } }, "outputs": [], "source": [ "class ConvNormPool(nn.Module):\n", " \"\"\"Conv Skip-connection module\"\"\"\n", " def __init__(\n", " self,\n", " input_size,\n", " hidden_size,\n", " kernel_size,\n", " norm_type='bachnorm'\n", " ):\n", " super().__init__()\n", " \n", " self.kernel_size = kernel_size\n", " self.conv_1 = nn.Conv1d(\n", " in_channels=input_size,\n", " out_channels=hidden_size,\n", " kernel_size=kernel_size\n", " )\n", " self.conv_2 = nn.Conv1d(\n", " in_channels=hidden_size,\n", " out_channels=hidden_size,\n", " kernel_size=kernel_size\n", " )\n", " self.conv_3 = nn.Conv1d(\n", " in_channels=hidden_size,\n", " out_channels=hidden_size,\n", " kernel_size=kernel_size\n", " )\n", " self.swish_1 = Swish()\n", " self.swish_2 = Swish()\n", " 
self.swish_3 = Swish()\n", " if norm_type == 'group':\n", " self.normalization_1 = nn.GroupNorm(\n", " num_groups=8,\n", " num_channels=hidden_size\n", " )\n", " self.normalization_2 = nn.GroupNorm(\n", " num_groups=8,\n", " num_channels=hidden_size\n", " )\n", " self.normalization_3 = nn.GroupNorm(\n", " num_groups=8,\n", " num_channels=hidden_size\n", " )\n", " else:\n", " self.normalization_1 = nn.BatchNorm1d(num_features=hidden_size)\n", " self.normalization_2 = nn.BatchNorm1d(num_features=hidden_size)\n", " self.normalization_3 = nn.BatchNorm1d(num_features=hidden_size)\n", " \n", " self.pool = nn.MaxPool1d(kernel_size=2)\n", " \n", " def forward(self, input):\n", " conv1 = self.conv_1(input)\n", " x = self.normalization_1(conv1)\n", " x = self.swish_1(x)\n", " x = F.pad(x, pad=(self.kernel_size - 1, 0))\n", " \n", " x = self.conv_2(x)\n", " x = self.normalization_2(x)\n", " x = self.swish_2(x)\n", " x = F.pad(x, pad=(self.kernel_size - 1, 0))\n", " \n", " conv3 = self.conv_3(x)\n", " x = self.normalization_3(conv1+conv3)\n", " x = self.swish_3(x)\n", " x = F.pad(x, pad=(self.kernel_size - 1, 0)) \n", " \n", " x = self.pool(x)\n", " return x" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "gather": { "logged": 1644332993953 } }, "outputs": [], "source": [ "class RNN(nn.Module):\n", " \"\"\"RNN module(cell type lstm or gru)\"\"\"\n", " def __init__(\n", " self,\n", " input_size,\n", " hid_size,\n", " num_rnn_layers=1,\n", " dropout_p = 0.2,\n", " bidirectional = False,\n", " rnn_type = 'lstm',\n", " ):\n", " super().__init__()\n", " \n", " if rnn_type == 'lstm':\n", " self.rnn_layer = nn.LSTM(\n", " input_size=input_size,\n", " hidden_size=hid_size,\n", " num_layers=num_rnn_layers,\n", " dropout=dropout_p if num_rnn_layers>1 else 0,\n", " bidirectional=bidirectional,\n", " batch_first=True,\n", " )\n", " \n", " else:\n", " self.rnn_layer = nn.GRU(\n", " input_size=input_size,\n", " hidden_size=hid_size,\n", " num_layers=num_rnn_layers,\n", " 
dropout=dropout_p if num_rnn_layers>1 else 0,\n", " bidirectional=bidirectional,\n", " batch_first=True,\n", " )\n", " def forward(self, input):\n", " outputs, hidden_states = self.rnn_layer(input)\n", " return outputs, hidden_states" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "gather": { "logged": 1644332994075 } }, "outputs": [], "source": [ "class RNNAttentionModel(nn.Module):\n", " def __init__(\n", " self,\n", " input_size,\n", " hid_size,\n", " rnn_type,\n", " bidirectional,\n", " n_classes=5,\n", " kernel_size=5,\n", " ):\n", " super().__init__()\n", " \n", " self.rnn_layer = RNN(\n", " input_size=46,\n", " hid_size=hid_size,\n", " rnn_type=rnn_type,\n", " bidirectional=bidirectional\n", " )\n", " self.conv1 = ConvNormPool(\n", " input_size=input_size,\n", " hidden_size=hid_size,\n", " kernel_size=kernel_size,\n", " )\n", " self.conv2 = ConvNormPool(\n", " input_size=hid_size,\n", " hidden_size=hid_size,\n", " kernel_size=kernel_size,\n", " )\n", " self.avgpool = nn.AdaptiveMaxPool1d((1))\n", " self.attn = nn.Linear(hid_size, hid_size, bias=False)\n", " self.fc = nn.Linear(in_features=hid_size, out_features=n_classes)\n", " \n", " def forward(self, input):\n", " x = self.conv1(input)\n", " x = self.conv2(x)\n", " x_out, hid_states = self.rnn_layer(x)\n", " x = torch.cat([hid_states[0], hid_states[1]], dim=0).transpose(0, 1)\n", " x_attn = torch.tanh(self.attn(x))\n", " x = x_attn.bmm(x_out)\n", " x = x.transpose(2, 1)\n", " x = self.avgpool(x)\n", " x = x.view(-1, x.size(1) * x.size(2))\n", " x = F.softmax(self.fc(x), dim=-1)\n", " return x" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "gather": { "logged": 1644332994213 } }, "outputs": [], "source": [ "class Meter:\n", " def __init__(self, n_classes=5):\n", " self.metrics = {}\n", " self.confusion = torch.zeros((n_classes, n_classes))\n", " \n", " def update(self, x, y, loss):\n", " x = np.argmax(x.detach().cpu().numpy(), axis=1)\n", " y = y.detach().cpu().numpy()\n", 
" # print('here!', recall_score(x,y, average='macro', zero_division=1))\n", " self.metrics['loss'] += loss\n", " self.metrics['accuracy'] += accuracy_score(x,y)\n", " self.metrics['f1'] += f1_score(x,y,average='macro')\n", " self.metrics['precision'] += precision_score(x, y, average='macro', zero_division=1)\n", " self.metrics['recall'] += recall_score(x,y, average='macro', zero_division=1)\n", " \n", " self._compute_cm(x, y)\n", " \n", " def _compute_cm(self, x, y):\n", " for prob, target in zip(x, y):\n", " if prob == target:\n", " self.confusion[target][target] += 1\n", " else:\n", " self.confusion[target][prob] += 1\n", " \n", " def init_metrics(self):\n", " self.metrics['loss'] = 0\n", " self.metrics['accuracy'] = 0\n", " self.metrics['f1'] = 0\n", " self.metrics['precision'] = 0\n", " self.metrics['recall'] = 0\n", " \n", " def get_metrics(self):\n", " return self.metrics\n", " \n", " def get_confusion_matrix(self):\n", " return self.confusion" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "gather": { "logged": 1644397187037 } }, "outputs": [], "source": [ "class Trainer:\n", " def __init__(self, net, lr, batch_size, num_epochs):\n", " self.net = net.to(config.device)\n", " self.num_epochs = num_epochs\n", " self.criterion = nn.CrossEntropyLoss(weight=torch.tensor([1,3,3,4,12]).float().to(config.device))\n", " # self.optimizer = AdamW(self.net.parameters(), lr=lr)\n", " self.optimizer = SGD(self.net.parameters(), lr=lr)\n", " # self.scheduler = CosineAnnealingLR(self.optimizer, T_max=num_epochs, eta_min=5e-6)\n", " self.scheduler = StepLR(self.optimizer, step_size=100, gamma=1.0)\n", " self.best_loss = float('inf')\n", " self.phases = ['train', 'val']\n", " self.dataloaders = {\n", " phase: get_dataloader(phase, batch_size) for phase in self.phases\n", " }\n", " self.train_df_logs = pd.DataFrame()\n", " self.val_df_logs = pd.DataFrame()\n", " \n", " def _train_epoch(self, phase):\n", " print(f\"{phase} mode | time: 
{time.strftime('%H:%M:%S')}\")\n", " \n", " self.net.train() if phase == 'train' else self.net.eval()\n", " meter = Meter()\n", " meter.init_metrics()\n", " \n", " for i, (data, target) in enumerate(self.dataloaders[phase]):\n", " data = data.to(config.device)\n", " target = target.to(config.device)\n", " \n", " output = self.net(data).to(config.device)\n", " loss = self.criterion(output.to(config.device), target.to(config.device))\n", " \n", " if phase == 'train':\n", " self.optimizer.zero_grad()\n", " loss.backward()\n", " self.optimizer.step()\n", " \n", " meter.update(output, target, loss.item())\n", " \n", " metrics = meter.get_metrics()\n", " metrics = {k:v / i for k, v in metrics.items()}\n", " df_logs = pd.DataFrame([metrics])\n", " confusion_matrix = meter.get_confusion_matrix()\n", " \n", " if phase == 'train':\n", " self.train_df_logs = pd.concat([self.train_df_logs, df_logs], axis=0)\n", " else:\n", " self.val_df_logs = pd.concat([self.val_df_logs, df_logs], axis=0)\n", " \n", " # show logs\n", " print('{}: {}, {}: {}, {}: {}, {}: {}, {}: {}'\n", " .format(*(x for kv in metrics.items() for x in kv))\n", " )\n", " fig, ax = plt.subplots(figsize=(5, 5))\n", " cm_ = ax.imshow(confusion_matrix, cmap='hot')\n", " ax.set_title('Confusion matrix', fontsize=15)\n", " ax.set_xlabel('Actual', fontsize=13)\n", " ax.set_ylabel('Predicted', fontsize=13)\n", " plt.colorbar(cm_)\n", " plt.show()\n", " \n", " return loss\n", " \n", " def run(self):\n", " for epoch in range(self.num_epochs):\n", " self._train_epoch(phase='train')\n", " with torch.no_grad():\n", " val_loss = self._train_epoch(phase='val')\n", " self.scheduler.step()\n", " \n", " if val_loss < self.best_loss:\n", " self.best_loss = val_loss\n", " print('\\nNew checkpoint\\n')\n", " self.best_loss = val_loss\n", " torch.save(self.net.state_dict(), f\"best_model_epoc{epoch}.pth\")\n", " #clear_output()\n", " " ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "gather": { "logged": 
1644397187238 } }, "outputs": [], "source": [ "attn_model = RNNAttentionModel(1, 64, 'lstm', False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. 
Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "train mode | time: 09:00:00\n", "loss: 1.5601879076803884, accuracy: 0.07522580645161298, f1: 0.06748332740742076, precision: 0.3165194883003588, recall: 0.2457613616641853\n", "val mode | time: 09:00:31\n" ] }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAU4AAAEnCAYAAADGqKr7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7hdVX3u8e9riEAFROSeoFABK1DFknJQ2nMolJJWKuiRGvsoaYvGUmixh7aCx1at91bQYpVHFAqoJVC8gFTAyEUPFgIJIhACGiXKhkAM10AlGHjPH3MsXdnZt7mYa629st7P88xnzzXWvPz23tm//MYYc84l20RExNQ9p98BREQMmiTOiIiakjgjImpK4oyIqCmJMyKips36HUBEbBrmzp3rNWvW1Npn6dKlV9qe26WQuiaJMyIasWbNGpYsWVJrH0nbdymcrkrijIiGGFjf7yB6IokzIhqUxBkRUUMqzoiImpI4IyJqSuKMiKhpeBJnLoDvMkmvl3S1pEckrZP0fUkf6NZlGJIOlnSzpCclNfboK0nvlVTvIr1pTNICSUfX2P5cSfWutRk6rcRZZxlMqTi7SNJpwDuAfwM+DjwG7AP8ObAv8LounPYzwGrgCGBdg8f9HPC1Bo/XbwuA24GvTnH79wNbdi+cTcXgJsM6UnF2iaQ/BP4P8Dbbb7X9Ndvfsn0m8BvAWV069a8BXy3nuqGpg9oesb20qeMNCklbAtj+oe3b+x3P9Gbg6ZrLxCRtIelGSd+TtEzS+0r7dpIWSfpB+fqCtn1OlbRC0l2SjmhrP0DSbeW9MySptG8u6cLSvljS7pPFlcTZPX8N3Gz7nNFv2H7a9uWt15K2l3SepAcl/bekayXNad9H0kpJH5P015JGJD0saaGkbcv7h5Su+QzgXyRZ0rnlPUs6cdTxNuh6S9pW0uck3Ve6+T+R9Nnxti9te0j6qqTHJK2V9DVJe47axpJOkvQhST+VtFrSpyRtPtEPr9U1lvQaSXeUn8t/lj+YPSVdI+mJss3LR+17sqSbJD0q6YHRcUm6FjgAmF/is6Q/afs5nybp7yWNUPUSNuqqS7pM0p2txNp23icl7TvR97bp6kpXfR1wqO1XAPsDcyUdBJwCXGV7L+Cq8hpJ+wDzqHp0c4FPS5pRjnUmVU9jr7K0bvU8DnjY9p5UPcOPThZUEmcXSJoJvBq4Yoq7fJWqa/03wBupfi/XjE5CwB8Bh1H98t8JHAl8qLx3M/Cqsn5aWX9/jbBPB36LKuEfAbyL6i9hTCXxXQW8DHgb8CfAHsC3JG03avOTgV2BNwP/DLwdOGkKMb0I+Efg3VTf86upKvWFZXkD1XDTwlb1UMwG/hU4qsQ2A/iOpOeX9/8CuBP4OtXP6VXAf7bt/8fA/yrbvXGc2N4G7AB8GEDSy4APAO+xvWwK39smqPnE6crj5eXMspjqd3teaT8PaI1XHwUstL3O9t3ACuBASbsA29i+3tXHXpw/ap/WsS4GDhv172kjGePsjhcCmwM/mWxDSXOBg4FDbH+rtF0NrAT+lirJtPwcONr2+rJd63/Xv7D9GHBD+X2v7KCbfiDwKdsXtrV9YYLt/5Qqse1t+0clnsXAj0rMH27bdqXtPynrV0o6GHg98E+TxLQd8CrbPyzHfznVz2S+7fNLm6iS3q8BywFs/3XrAKXaWEQ17nsUcL7tOyQ9Afx0gp/TkbafHC8w26tKFf9FSV8r3+93gY9N8j3FhrbXhpNuZ9neYBir/A6XAntS/RtdLGkn26vgF7+LHcvms4D23+lIaft5WR/d3trnnnKs9ZIepfobHncyNImzu6Yyq30g1R/wt36xk/2EpMuoKsB217SSZnEHsKOk59p+6lnGegvwt5KeBr5p+/tTiPvmVtIscY9I+s4YcX9j1Os7gDlMbmUraRYryterx2ibRUmcpSv3fqqx5Pbqd+8pnBOqLuC4SbPF9gWSXk+VuJ8BXmF
78oG7TVrtyaE1tif8t1B+pvuXYamvSNpvgs3HqhQ9QftE+4wrXfXueJBqbOZFU9h2F+CBMdofYMM/eoBHRr1+iuqX/ty6AY7hRKohg38A7lI16D5vgu2fbdxbTCGmsfYb3d5q2wJA0ouoErWoKt+Dgd+kqjinck4Y+/sazwVUvYtFtn9QY79NUHcvR7L9CHAt1djkA6X7Tfm6umw2AuzWttts4L7SPnuM9g32kbQZ8HzgoYliSeLsAts/B75DNVY4mVXAjmO078Qkv7wa1rFxct0gudl+xPZf2d4ZeAWwmKobus84x+xF3J2YC/wKcJTti23/F1U1PTqZT2RK179K2oZqMuG7wGvVNoM7nJpPnJJ2aJsA3RL4Xarx6UuB+WWz+cAlZf1SYJ6qmfI9qCaBbizd+rWSDirDO8eO2qd1rDcAV3uSj/9N4uyeTwBzJM0f/Yak55SxTagS1I6S/mfb+78CvAa4rqFYRqgmcX5xfuDQ8Ta2fSvVWOJzqMYOx7IYOKD842wddxbVBE5TcXdiS6puc/tf5R+x8bDUVKveiXyCauLpUODfgc+1TUANoa5UnLtQTZTeCtxEVdlfBnwEOFzSD4DDy2vKxNxFVMNBVwAntA2fHE91PfIK4IdA68qWs4EXSlpBdQnhKZMFlTHOLrH9NUmnA2eXyZBLgMepEtGfU03+XGH7yjIueKGkU6i6+X9DlQD+uaFwvgKcIOm7VJM3bwW2ad9A0nVlu9up/gLeBjwB3DjOMc+lmtm/XNI/UF2U916qAfXPNBR3J66mSmb/JulsqstS/oaNu/13AkeUKvFB4G7bD071JJKOpJog+33bj0j6S6qf3b9QXWEwhJq/5bL8J/7KMdofpLrCZKx9Pgh8cIz2JcBG46NlPPuYOnGl4uwi2ydTXc6yF1VFsojq0pyrqP73a3ldee8TwH9Qjc8dansFzXhfOe4HqBLeLcDo60uvp/qDv5jqf+ztqZLCCGOwvY5fdpvOprqc48dUVwf0ratu+zaqhPY/gMuoLi06Bnh01KYfoJpMuoiqkvnDqZ6jXG51FvBZ21eU8z5E9Z/NfFU3Pwyp4bjlUpN05SMipmTOnL29ZMkna+0jzV062az6dJSuekQ0ZHiejpTEGRENSeKMiKgpiTMiogNJnH01Q/LMfgdR034HHNDvEGq5c+ngPSVuEC8D2fuAwYp65cpnWLPGEz7kYmypOPtuJhveNzUIliwZrAeEv3riB8BMS8/rdwAdWLTk2V5n31tz5kx6m/44kjgjImpqPch405fEGRENScUZEdGB4UicgzVqHRExDaTijIiGpKseEVFTEmdERE1JnBERNSVxRkR0IIkzIqKGVJwRETUlcUZE1JTEGRFRUxJnREQHkjgjImpIxRkRUVMSZ0RETcOTOHv2dCRJcyXdJWmFpFN6dd6I6KWnay6DqScVp6QZwKeAw4ER4CZJl9q+oxfnj4heGJ6Ks1dd9QOBFbZ/BCBpIXAUkMQZsckYnsTZq676LOCettcjpW0DkhZIWiJpyeAW8RGxqetVxTnWxyl6owb7LOAsgC2kjd6PiOlseCrOXiXOETb8tN/ZwH09OndE9EQSZ9NuAvaStAdwLzAP+OMenTsieiKJs1G210s6EbgSmAGcY3tZL84dEb2UxNko218Hvt6r80VErw1PxZmPB46IhrQSZ51lYpJ2k3SNpOWSlkk6qbS/V9K9km4pyx+07XNqudHmLklHtLUfIOm28t4ZklTaN5d0YWlfLGn3yeLKLZcR0ZCuVJzrgZNt3yxpa2CppEXlvY/b/lj7xpL2oZpD2RfYFfimpL1tPw2cCSwAbqDq/c4FLgeOAx62vaekecBHgTdOFFQqzohoSPMVp+1Vtm8u62uB5YxxDXibo4CFttfZvhtYARwoaRdgG9vX2zZwPnB02z7nlfWLgcNa1eh4kjgjokHNJs52pQv9SmBxaTpR0q2SzpH0gtI23s02s8r66PYN9rG9HngUeOFEsSR
xRkRDOqo4t2/dLViWBWMdWdJWwJeAd9h+jKrb/RJgf2AVcFpr03ECm+gmnCndoNMuY5wR0ZCOxjjX2J4z0QaSZlIlzS/a/jKA7Qfa3v8scFl5Od7NNiNlfXR7+z4jkjYDng88NFFMqTgjoiFdmVUXcDaw3Pbpbe27tG32OuD2sn4pMK/MlO8B7AXcaHsVsFbSQeWYxwKXtO0zv6y/Abi6jIOOKxVnRDSkK7PqBwNvAW6TdEtpexfwJkn7l5OuBN4OYHuZpIuonry2HjihzKgDHA+cC2xJNZt+eWk/G/i8pBVUlea8yYJK4oyIBjX7XDPb1zH2GOS4N9PY/iDwwTHalwD7jdH+JHBMnbiSOCOiIcNz51ASZ0Q0ZHgSZyaHIiJqSsUZEQ0ZnooziTMiGpTEGRFRQyrOiIiakjj77ufAA5NuNb3MmviBKtPOI/0OYEjM0n/3O4RaftrxnkmcERH1eTg+2DuJMyKa80y/A+iNJM6IaIZp+o7LaSuJMyKakcQZEdGBdNUjImpIxRkR0YFUnBERNaTijIjoQBJnREQNJl31iIjaUnFGRNQwRGOceQJ8RERNqTgjojkZ44yIqGGIuupJnBHRnFScERE1pOKMiKhpiBJnT2bVJZ0jabWk23txvojok2dqLgOqV5cjnQvM7dG5IqIfWhVnnWVA9aSrbvvbknbvxbkioo8GOBnWkTHOiGhG7lXvD0kLgAUAg/VBuxEBpOLsB9tnAWcBzJDc53Aioo5UnBERHRiSirNXlyNdAFwPvFTSiKTjenHeiOihzKo3y/abenGeiOizIemq57FyEdGMLlScknaTdI2k5ZKWSTqptG8naZGkH5SvL2jb51RJKyTdJemItvYDJN1W3jtDkkr75pIuLO2Lp3LpZBJnRDSn+a76euBk2y8DDgJOkLQPcApwle29gKvKa8p784B9qW66+bSkGeVYZ1JdtbNXWVo35RwHPGx7T+DjwEcnCyqJMyKmLdurbN9c1tcCy4FZwFHAeWWz84Cjy/pRwELb62zfDawADpS0C7CN7ettGzh/1D6tY10MHNaqRseTxBkRzWhdjlTvXvXtJS1pWxaMd/jShX4lsBjYyfYqqJIrsGPZbBZwT9tuI6VtVlkf3b7BPrbXA48CL5zoW83lSBHRnPoz5Wtsz5lsI0lbAV8C3mH7sQkKwrHe8ATtE+0zrlScEdGMzirOSUmaSZU0v2j7y6X5gdL9pnxdXdpHgN3adp8N3FfaZ4/RvsE+kjYDng88NFFMSZwR0ZzmZ9UFnA0st31621uXAvPL+nzgkrb2eWWmfA+qSaAbS3d+raSDyjGPHbVP61hvAK4u46DjSlc9IprRnQcZHwy8BbhN0i2l7V3AR4CLys00PwGOAbC9TNJFwB1UM/In2G5FdTzVIy63BC4vC1SJ+fOSVlBVmvMmC0qTJNa+mSH5ef0Ooqat+x1ATY/0O4AhsW2/A6jpp8BTdu3n7Mx5ibzkQ/X20TyWTmWMc7pJxRkRzRiij85I4oyIZiRxRkR0YEjuVU/ijIhmpOKsSPqHqRzE9j82E05EDLRUnAD8dtu6gP8J3A/8GHgxsDPwre6EFhEDJRVnxfbhrXVJpwNXAx9uXRwq6VRg+65GGBGDI4lzI8cCO4+6ov6fqSrQkxuNKiIGTz5zaEw/A/YDbmlr+3XgyUYjajNo/3lt1+8AauraLy428OJ+B1DTo89m50H7o+1QncT5aeAKSZ8BVgK7Uz0U9JPNhxURAycV58Zsf1jSCNV9o8cA9wLvtH1+t4KLiJiOal3HafvzwOe7FEtEDLp01Tcm6Vepnhyyq+0TJe0NzLS9rCvRRcTgGKLLkab8PE5JhwPfo/rApGNL8w7Ax7oQV0QMoi48yHg6qlNxfgQ4xvYVkh4ubTcDv9F8WBExcIao4qyTOF9i+4qybgDbPyuPtY+IYTdEibPOR2fcI2m/9gZJr6C6NCkiYmi66nUS5xnAlyW9GZgh6X8DX6D6APeIGHa
tirPBzxyarupcx/nZ8iFH7wRmAO8DPlEuUYqIGOgqso6613GeBZzVpVgiYpBljHNjkpaP035bc+FExEBLV30js2u2R8Qwyb3qvyTpXa1t29Zb9gTuaTyqiBhMA1xF1jGVirP1MOOZbetQ/d9yP/BnTQcVEQNoiMY4J02ctn8HQNInbf9l90OKiIE1JF31WtdxStq5vUHSTpL2bDimiBhEQ3QdZ53E+e9s/PlCO5T2iIjcOTSGvW3fPqptGbD3ZDtK2k3SNZKWS1om6aRaUUZETCN1Lkd6RNL2tte0tW0PPDGFfdcDJ9u+WdLWwFJJi2zfUSfYiJjGhmhyqE7FuQg4U9JWAOXrJ4FvTLaj7VW2by7ra4HlwKz64UbEtDYkY5x1Ks5TgEuBByWtBnYElgKvrXNCSbsDrwQWj/HeAqoPgEN1DhoR/ZcL4Ddme42kg4HfpPrE05XAklGfsz6hUqV+CXiH7cfGOMcv7oWfIU35uBExTQxwFVlH3Yd8GLixLLWUBx5/Cfii7S/X3T8iprkhGuOcMHFKOsP2X5X1cZ+KZHvBJMcRcDaw3PbpnQQaEQNgSLrqk00OzRy1Pt4ymYOpPo/9UEm3lOUPOog3IqarLlwAL+kcSasl3d7W9l5J946VSySdKmmFpLskHdHWfoCk28p7Z5RiDkmbS7qwtC8uczCTmrDitH182/qfTuWA4xznOjLfE7Fp687k0LnAvwLnj2r/uO0NPmFX0j5UH1++L7Ar8E1Je9t+GjiTauL5BuDrwFzgcuA44GHbe0qaB3wUeONkQdW5HCkiYmINV5y2vw08NMWzHwUstL3O9t3ACuBASbsA29i+vszTnA8c3bbPeWX9YuCwVjU6kQkTp6RnJD092TLFbyoiNmWdddW3l7SkbZlwvqTNiZJuLV35F5S2WWz4mMuR0jarrI9u32Af2+uBR4EXTnbyyWbVf7ttfQ7w58BpwN3ArwLvAD4z2UkiYkjU76qvsT2n5j5nAu+nStXvp8pJf8bYw4GeoJ1J3hvXZGOc32mtS/pX4EjbPyxNV0m6mqq8PWOyE0XEJq5HlyPZfqC1LumzwGXl5QiwW9ums4H7SvvsMdrb9xmRtBnwfKYwNFBnjPMlbPy093upKs+IiJ7cclnGLFteB7Rm3C8F5pWZ8j2AvYAbba8C1ko6qIxfHgtc0rbP/LL+BuDqqdzUU+cC+KXAxyT9ne0nJW0BfAT4bo1jRMSmqguz6pIuAA6hGgsdAd4DHCJp/3LGlcDbAWwvk3QRcAfVg4VOKDPqAMdTzdBvSTWbfnlpPxv4vKQVVJXmvCnFNdU7JssDi78G7A607lX/MfBa29+f0kFqmCF5i6YP2mWDVnrfN/km0YCX9juAmm4FHrdrXz44Z2t5yf719tF1LO1gjLPv6tyrvkLSfsBBVDNR9wI3tGX0iBhmecjH2Gw/Lem/gJ3LuEFExNCZ8uSQpK0knQ38jOrCUiQdLek93QouIgbMkDyPs86s+mnATlT3nT9V2m5iCrcnRcQQGKIPa6vTVT8S2Mf2oyrPyrR9r6RduxNaRAycjHFuRFTd9F82VA8mfrzRiCJiMA3R8zjrdNW/A5w6qu0vgWuaCyciBla66mM6meo2yzcDW0m6jepZnId1JbKIGDzpqm/I9k/KdZxHAntQXfx+me2fTbxnRAyFIeqqTylxlpvfHwR2sv2l7oY0uAbtTpwn+x3AkPhevwOo6Vn9u0jF+Uu210taQ9U1z99bRGxsiCrOOpND7wHOlDRr0i0jYjhlcmgj/wbMAN4k6RnaHvZp+7lNBxYRAyb3qm+oPBnpjcC2wA8n2TwihtUAV5F1TJo4Jb0euJCq2nwKeL3tr3c7sIgYMBnj3MC7gXcBW1ONc76rqxFFxOB6puYyoKaSOPcATrP9BHA6sGd3Q4qIgZQ7hzYww/YzALZ/LikTQRExtgGuIuuYSuJ8rqT27vkWo15j+0PNhhURA2eIxjinkjhvAA5ve7141GsDSZw
RMTQmTZy2D+lBHBGxKUjFGRFRQy6Aj4joQCrOiIgaMjkUEdGBdNUjIuoZkoIziTMimjFEPfUkzohozpD01JM4I6IZqTgbJmkL4NvA5uWcF9t+Ty/OHRG9k4qzWeuAQ20/LmkmcJ2ky23f0KPzR0SXpeJsmG0Dj5eXM8vi8feIiEEzTImzzoe1PSuSZki6BVgNLLK9eIxtFkhaImlJsmrE4BmS5xj3LnHaftr2/sBs4EBJ+42xzVm259ieo14FFhGNGKLnGPcucbbYfgS4Fpjb63NHRHc1nTglnSNptaTb29q2k7RI0g/K1xe0vXeqpBWS7pJ0RFv7AZJuK++dIUmlfXNJF5b2xZJ2n8r32ZPEKWkHSduW9S2B3wXu7MW5I6I3Wg9Harirfi4bF1mnAFfZ3gu4qrxG0j7APGDfss+nJc0o+5wJLAD2KkvrmMcBD9veE/g48NGpBNWrinMX4BpJtwI3UY1xXtajc0fEgLL9beChUc1HAeeV9fOAo9vaF9peZ/tuYAXVsOAuwDa2ry8T1eeP2qd1rIuBw1rV6ER6Nat+K/DKXpwrIvqng3HL7SUtaXt9lu2zJtlnJ9urAGyvkrRjaZ9F9YkVLSOl7edlfXR7a597yrHWS3oUeCGwZqIAcudQRDSiw+cYr7E9p6EQxqoUPUH7RPtMqOeTQxGx6erRrPoDpftN+bq6tI8Au7VtNxu4r7TPHqN9g30kbQY8n42HBjaSxBkRjejh5UiXAvPL+nzgkrb2eWWmfA+qSaAbS7d+raSDyvjlsaP2aR3rDcDVZRx0QumqR0Rjmr6oXdIFwCFUY6EjwHuAjwAXSToO+AlwDIDtZZIuAu4A1gMn2G7l5+OpZui3BC4vC8DZwOclraCqNOdNKa4pJNe+mCF5i34HUdOgxftkvwOIaelJ4Gm79j0o+0peWHOfl8PSBsc4eyYVZ0Q0YpjuVU/ijIjGDPL953UkcUZEI1JxRkR0IBVnREQNqTgjIjqQxBkRUUOHt1wOpCTOiGhMKs6IiBoyxhkR0YFh6arnIR8RETWl4mzQi/sdQE139TuADszsdwAd+O1+B1DT/+twv3TVIyI6MCxd9STOiGhEKs6IiA4kcUZE1JAL4CMiOpCKMyKihoxxRkR0IF31iIgaUnFGRNSUyaGIiA6k4oyIqCFd9YiIDqSrHhFRQyrOiIgOJHFGRNQwTLPqeZBxRERNqTgjojHpqkdE1DBMXfWeJk5JM4AlwL22j+zluSOi+1JxdsdJwHJgmx6fNyK6bJguR+rZ5JCk2cBrgM/16pwR0VvP1FwGVS8rzk8AfwdsPd4GkhYACwDUo6AiohmpOBsm6Uhgte2lE21n+yzbc2zPSeKMGCytxFlnmQpJKyXdJukWSUtK23aSFkn6Qfn6grbtT5W0QtJdko5oaz+gHGeFpDMkdZxmetVVPxh4raSVwELgUElf6NG5I6JHuthV/x3b+9ueU16fAlxley/gqvIaSfsA84B9gbnAp8ukNMCZVD3avcoyt5PvEXqUOG2fanu27d2pvqmrbb+5F+eOiN7oVsU5jqOA88r6ecDRbe0Lba+zfTewAjhQ0i7ANravt23g/LZ9asudQxHRmA4qzu0lLWlbFoxxWAPfkLS07f2dbK8CKF93LO2zgHva9h0pbbPK+uj2jvT8Anjb1wLX9vq8EdFdHU4OrWnrfo/nYNv3SdoRWCTpzgm2HWvc0hO0dyR3DkVEY7oxq277vvJ1taSvAAcCD0jaxfaq0g1fXTYfAXZr2302cF9pnz1Ge0fSVY+IRrRuuWxyckjS8yRt3VoHfg+4HbgUmF82mw9cUtYvBeZJ2lzSHlSTQDeW7vxaSQeV2fRj2/apLRVnRDSmCxXnTsBXypVDmwH/bvsKSTcBF0k6DvgJcAyA7WWSLgLuANYDJ9huhXU8cC6wJXB5WTqSxBkRjejGBfC2fwS8Yoz2B4HDxtnng8AHx2hfAuzXRFxJnBHRmEG+jbKOJM6IaERuuYy
IiHGl4oyIxqSrHhFRwzB11ZM4I6IxSZwRETXkM4ciIjqQijMiooaMcUZE1JSuekREB1JxRkTUkIozIqIDqTj77BlY89/w4y4centgTReOy3e7cdBK12LukkGLF7oY82XdOGh3f8Yv7mSnTA5NA7Z36MZxJS2ZwqP6p5VBi3nQ4oXBi3m6xpuuekREDak4IyI6kMS56Tqr3wF0YNBiHrR4YfBinnbxDtOsuqrPZo+IeHa2klz3cykWw9LpOFY7mTzIOCKipmHsqkdEFwxTV31oKk5JcyXdJWmFpFP6Hc9kJJ0jabWk2/sdy1RJ2k3SNZKWS1om6aR+xzQRSVtIulHS90q87+t3TFMlaYak70rq0mWinXm65jKohiJxSpoBfAr4fWAf4E2S9ulvVJM6F5jb7yBqWg+cbPtlwEHACdP857wOONT2K4D9gbmSDupzTFN1ErC830G0a12OlMS56TgQWGH7R7afAhYCR/U5pgnZ/jbwUL/jqMP2Kts3l/W1VH/Ys/ob1fhceby8nFmWaT9bKmk28Brgc/2OZbRnai6DalgS5yzgnrbXI0zjP+hNgaTdgVcCi/sbycRKl/cWYDWwyPa0jrf4BPB3TLPck4pz06Mx2qZ9ZTGoJG0FfAl4h+3H+h3PRGw/bXt/YDZwoKS6V9T0lKQjgdW2l/Y7ltGSODc9I8Buba9nA/f1KZZNmqSZVEnzi7a/3O94psr2I8C1TP9x5YOB10paSTXkdKikL/Q3pF9KV33TchOwl6Q9JD0XmAdc2ueYNjmSBJwNLLd9er/jmYykHSRtW9a3BH4XuLO/UU3M9qm2Z9venerf8dW239znsIBUnJsc2+uBE4ErqSYsLrK9rL9RTUzSBcD1wEsljUg6rt8xTcHBwFuoqqBbyvIH/Q5qArsA10i6leo/10W2p9XlPYNmWCrO3HIZEY14ruSda+5zz4Decpk7hyKiMYPc/a4jiTMiGjFMt1wmcUZEY1JxRkTUkCfAR0R0IF31iD6SdC6w3vZb+x1LTM0wVZxDcR1nTJ2kd0uypGNr7GNJv9XNuCKmk1Sc8QuSngMcR/VUprcD5/c3ohgkz8CVa6vPe6+jW58N31VJnNHuCKr7+I8GLlIKVcoAAAHTSURBVJO0n+3bASS9HPgn4ABgBtWFy4dL+l7Z9xuSngEW2n5ruZf63ba/UPbfHbgb2M32iKTDgA8Be1M9x/Mq4K9sr+7NtxpNsz3d7/NvTLrq0e7twOW2/xP4HrAAQNIuwLfKsjuwM/BRgPIQYIDfs71VjTHJdVS3we4A/DqwK/AvzXwbEd2VxBkASNqV6uG455Smc4C3lIdfvIXqQdAftv2E7adsf/PZnM/2dbZvsr3e9v1U1exhz+aYEb2SxBktrbHN1kMuvgBsCbyRqsr8fpMnk3SApCsl3S/pMeACquozYtpL4ozWpNBbgW2BEUn3A3dQjWUuAFYCe01wiLGeFPM48Ly217uOen8hcDOwt+1tgDd1FHxEHyRxBlQP750NvJrqQ8tay2uAV1F9/MVLJb1T0q9Imlkmd1ruZ+PEuoTqQ/G2krQD8Pej3t8GeBRYK+lFwLT/5NGIliTOgGpS6Ku2l9q+v235BtUzQY8BDgEOp3qa/gPAO9v2/7/AP0p6WNJnStu7qa6HXkX1ZPWFo865gKrKXQt8GfiPbnxjEd2Q53FGRNSUijMioqYkzoiImpI4IyJqSuKMiKgpiTMioqYkzoiImpI4IyJqSuKMiKjp/wMtX5EUCzmbdQAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "loss: 1.3856678911868263, accuracy: 0.11747619720965312, f1: 0.07200591838594485, precision: 0.27436380889422163, recall: 0.3566003931926984\n", "\n", "New checkpoint\n", "\n", "train mode | time: 09:00:38\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUgAAAEnCAYAAADLttq8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7hcdX3v8ffHcIvSFGi4xASFalCBFmzSNErbY0VLWimhnkONfZDoQWMpKvZQK3hsqa0obZUiPcJjBCUUS0y9gSiXNIB9tNzCxUK4lFQQIoEYFImogcDn/LF+W4bNzOyZzZqZvWd/Xs+znr3mN+u31i+BfPfvvmSbiIh4tucNugARERNVAmRERAsJkBERLSRARkS0kAAZEdHCdoMuQEQMh0WLFnnz5s1d5bnxxhsvt72oR0V6zhIgI6IWmzdvZu3atV3lkTSzR8WpRQJkRNTEwLZBF6JWCZARUaMEyIiIJlKDjIhoIQEyIqKFBMiIiBaGL0BmoniPSXqjpCslPSJpq6T/kvThXk1vkHSIpJsk/UxSbVs1SfprSd1NcpvAJC2TdGQX158nqbs5LFPOSIDs5pjYUoPsIUkfB94LfBb4R+BRYH/gT4ADgD/swWM/BWwCDgO21njfc4Cv1ni/QVsG3AZ8pcPr/xaY3rviDIuJH/S6kQDZI5L+APg/wLG2P9Pw1TckLQd+t0ePfjmw3PY36ryp7Q3AhjrvORlImm77p7b/e9BlmfgMPDnoQtQqTeze+TPgplHBEQDbT9q+dOSzpJmSVkh6WNJPJF0taX5jHkn3SvqYpD+TtEHSDyWtlLRL+f41pUk9DfiEJEs6r3xnSe8adb9nNJkl7SLpHEkPlOb5fZI+3er6kravpK9IelTSFklflfTSUddY0gmSPiLp+5I2SfqkpB3b/eWNNGklvUHS7eXv5WuSdpP0UklXSXqsXPOro/KeKOkGST+S9NDockm6GpgHLC3ls6S3Nvw9f1zSX0raQFXrf1YTW9Ilku6UNH3Uc38m6YB2f7bhNXxN7ATIHpC0PfBq4LIOs3yFqkn858CbqP67XDU62AB/BBxK1Tx8P3A48JHy3U3Aq8r5x8v533ZR7NOB36QK7IcBH6D6P76pEuDWAK8A3gG8FdiXqoa826jLTwReCBwN/APwTuC
EDsr0IuBvgA9S/ZlfDSwHVpbjf1G1glZKUkO+OcD/AxaXsk0DviXpF8v3fwrcCXyd6u/pVcDXGvL/MfA/ynVvalG2dwC7Ax8FkPQK4MPAKbbXdfBnG0LDFyDTxO6NXwJ2BO4b60JJi4BDgNeMNIslXQncC7yPKpiMeAI40va2ct3+wBLgT20/Clxb4sS9tq/tsswLgE/a/nxD2gVtrn8bVQDbz/Z3SnmuA75TyvzRhmvvtf3Wcn65pEOANwJ/P0aZdgNeNdK8LTXF9wFLbZ9f0kQV3F4O3AFg+89GbiBpGrCaql92MXC+7dslPQZ8v83f0+G2f9aqYLY3llr55yR9tfx5bwY+NsafKSaRBMje6mQUeQHVP9Sf9xnafkzSJVQ1ukZXjQTH4nZgD0k72H78OZb1FuB9kp4E/s32f3VQ7ptGgmMp9wZJ32pS7itGfb4dmM/Y7h3V97e+/LyySdpsSoCUtJCq9vxrVEF2xH4dPBNgTbvgOML2hZLeSBWgnwIOsj1cnXBdm/i1wm6kid0bD1ONIL+og2tnAQ81SX+IZ/7jBnhk1OfHAQE7dFvAJt5F1dT/K+AuSXdLWtLm+uda7p06KFOzfKPTR9J2ApD0IqqALKqa7CHAr1PVIDt5JjT/c7VyIVVrYbXtu7vIN4SGr4mdANkDtp8AvkXVlzeWjcAeTdL3BH5QU5G28uwg+owgZvsR2++xvRdwEHAdVfNx/xb37Ee5x2MR8Hxgse0v2P4Pqtrx6KDdTkfzRyXNoJq+dTNwhKRO/nsPsQTI6NwZwHxJS0d/Iel5pe8RqkC0h6Tfbvj++cAbgG/WVJYNVIMpP38+8NpWF9v+T6q+vudR9e01cx0wT9K+DfedTTWQUle5x2M6VXO38V/fH/Hs7qROa7HtnEE1APRa4F+AcxoGgqag4QuQ6YPsEdtflXQ6cG4ZlLgI+DFVwPkTqkGYy2xfXvrtPi/pJKrm+Z9T/UP/h5qK82XgeEk3Uw2ivB2Y0XiBpG+W626j+j/9HcBjwPUt7nke1Uj6pZL+imoC3F8Dm6kmqw/KlVRB67OSzqWakP/nPLu5fidwWKn1PQzcY/vhTh8i6XCqgarfs/2IpHdT/d19gmpEfwrKUsPogu0TqaaJzKWqYaymmvKyBjiu4dI/LN+dAfwrVf/Za22vpx4fKvf9MFVguwUYPT/zGqp/2F8AVgEzqf7xN50cbnsr8DqqQHMusAL4LtVo/MCa2LZvpQpcvwFcQjVl5yjgR6Mu/TDVoM4q4AbgDzp9RpnGtBz4tO3LynN/QPVLZamqRQJT1HDVIGXXtlw3Iqaw+fP389q1/9RVHmnRjbY7mdEwEKlBRkRNetMHWVaPrZN0m6QLJe1UVlStLrMtVkvateH6kyWtl3RX48CZpHmSbi3fnTlqcUFTCZARUZP6A2QZ+HsPMN/2gVT9y0uAk6jmq86l6rI6qVw/snjiAKoZDWeVxQIAZ1OtyJpbjjHfppgAGRE16dko9nbAdEnbUU3heoBqVdSK8v0KYGTrusXASttbbd9DtZBggaRZwAzb17jqVzy/IU9LCZARUaOuA+TMsuHIyLGs8W62v0e1fPM+qrm3P7J9BbCn7Y3lmsY5ubOB+xtusaGkzeaZu1GNpLc1Yaf5bCe5juUh/TRj7EsmlI7ntEwgTw26AOPwynnzBl2Ertx7771s3rx5zP65ZxvXNJ/N7QZpSt/iYqqNUB4B/lXS0W3u16zcbpPe1oQNkDvQeobyRPX6QRegS58ddAHG4bFBF2Ac1q6dXBuRz58/3kHlnsyDfB3VHNXvA0j6EtVihIckzSqbhsyiWkoKVc1w74b8c6ia5BvK+ej0ttLEjoiajGyY280xpvuAhZKeX0adD6Wav3oxMLJKbSnVQgxK+hJJO5ZVXnOB60szfIukheU+xzTkaWnC1iAjYrKpvwZp+zpJX6Da73Qb1br35cDOwCpJx1IF0aPK9eskraLaMWobcHzDDkvHUS2UmA5cWo6
2EiAjokb1r46xfQpwyqjkrVS1yWbXnwqc2iR9LXBgN89OEzsiooXUICOiJsO3WUUCZETUJAEyIqKFBMiIiBYSICMi2kiAjIhoIjXIiIgWEiAjIlpIgIyIaCEBMiKijQTIiIgmUoOMiGghATIiooXhC5B9281H0qLyGsb1kk7q13Mjop9q3zB3oPpSgyyvXfwk1VsJNgA3SLrY9u39eH5E9MPw1SD71cReAKy3/R0ASSupXsSTABkxNIYvQParid3qVYzPIGnZyOsfh+uvOSImo37VIDt65aLt5VTvm+D50pivZIyIiWT4apD9CpCtXsUYEUMjAXK8bgDmltcwfg9YAvxxn54dEX2RADkutrdJehdwOTAN+Iztdf14dkT003AFyL7Ng7T9ddv72X5JeS1jRAyVkRpkN0d7kl4m6ZaG41FJ75W0m6TVku4uP3dtyHNymW99l6TDGtLnSbq1fHempGZjI8+Q175GRE3qD5C277J9sO2DgXnAT4AvAycBa2zPBdaUz0jan6oL7wBgEXBWmYcNcDawDJhbjkVjPT8BMiJqUn+AHOVQ4L9tf5dqHvWKkr4COLKcLwZW2t5q+x5gPbBA0ixghu1rbBs4vyFPS1mLHRE1GdcgzUxJaxs+Ly/T/ZpZAlxYzve0vRHA9kZJe5T02cC1DXlG5lw/Uc5Hp7eVABkRNeo6QG62PX+siyTtABwBnDzWpU3S3Ca9rQTIiKhJT6f5/B5wk+2HyueHJM0qtcdZwKaS3mrO9YZyPjq9rfRBRkRNetoH+Waebl4DXAwsLedLgYsa0pdI2rHMu54LXF+a41skLSyj18c05GkpNciIqElvapCSnk+1E9g7G5JPA1ZJOha4DzgKwPY6SauoNsLZBhxve2RfteOA84DpwKXlaCsBMiJq0psAafsnwC+NSnuYalS72fWnAs+aa217LXBgN89OgIyIGk38TXC7kQAZETXJWuyIiBaGL0BmFDsiooXUICOiJsNXg0yAjIgaJUBGRDSRGmRERAsJkH2zlWqfosnkrkEXICakvcfel3VCeWjsS1pIgIyIaM2ZKB4R0dxTgy5AvRIgI6IeZthWGiZARkRNEiAjItpIEzsioonUICMi2kgNMiKiidQgIyLaSICMiGjCpIkdEdFSapAREU0MYR9kdhSPiGghATIi6vNUl0cHJO0i6QuS7pR0h6RXSdpN0mpJd5efuzZcf7Kk9ZLuknRYQ/o8SbeW786Uxt5mKQEyIuox0sTu5ujMJ4DLbL8cOAi4AzgJWGN7LrCmfEbS/sAS4ABgEXCWpGnlPmcDy4C55Vg01oMTICOiPjXXICXNAH4bOBfA9uO2HwEWAyvKZSuAI8v5YmCl7a2276HaVnaBpFnADNvX2DZwfkOelhIgI6Ie46tBzpS0tuFYNuquvwx8H/ispJslnSPpBcCetjcClJ97lOtnA/c35N9Q0maX89HpbWUUOyLqMb5R7M2257f5fjvg14B3275O0icozekWmvUruk16W32pQUr6jKRNkm7rx/MiYkDqH6TZAGywfV35/AWqgPlQaTZTfm5quH7vhvxzgAdK+pwm6W31q4l9Hh10iEbEJNaDQRrbDwL3S3pZSToUuB24GFha0pYCF5Xzi4ElknaUtC/VYMz1pRm+RdLCMnp9TEOelvrSxLb975L26cezImKAejNR/N3A5yTtAHwHeBtV5W6VpGOB+4CjAGyvk7SKKohuA463f/6inOOoKmvTgUvL0Vb6ICOiHj1ai237FqBZP+WhLa4/FTi1Sfpa4MBunj2hAmQZwVoGzXtUI2KCG7KlhhMqQNpeDiwHmCaNOcIUERNIdvOJiGhjyGqQ/ZrmcyFwDfAySRtKx2pEDJPeLTUcmH6NYr+5H8+JiAFLEzsiookh3A8yATIi6jNkATKbVUREtJAaZETUI9N8IiLaGLImdgJkRNQjNciIiDZSg4yIaCLTfCIi2kgTOyKiidQgIyJaSICMiGgjTeyIiCamWg1
S0l91chPbf1NPcSJiUptiNcjfajgX8NvAg8B3gRcDewHf6E3RImJSmWo1SNuvHzmXdDpwJfBR2y5pJwMze1rCiJg8plKAHOUYYK+R4Fj8A1WN8sRaSxURk88UX2r4U6pXJt7SkPYrwM9qLVEhJt9ebEP2yzNqMnfQBejSj55L5h78I5B0L7Cl3H2b7fmSdgM+D+wD3Av8ke0flutPBo4t17/H9uUlfR5Pvxf768AJoyp8z9JNDDoLuEzShyS9TdKHykPO6uIeETGsRmqQ3Ryd+x3bB9seeT/2ScAa23OBNeUzkvYHlgAHAIuAsyRNK3nOpnqt9NxyLBrroR0HSNsfBd4HvKr8fDXwftsf6fQeERE1WQysKOcrgCMb0lfa3mr7HmA9sEDSLGCG7WtKrfH8hjwtdTUP0vY/A//cTZ6ImEK6b2LPlLS24fNy28tHXWPgCkkGPlW+39P2RgDbGyXtUa6dDVzbkHdDSXuinI9Ob6urACnpl6mqry+0/S5J+wHb217XzX0iYgiNb5rP5oZmcyuH2H6gBMHVku5sc61alKxVelsdN7ElvR74NrCQakQbYHfgY53eIyKGXA/6IG0/UH5uAr4MLAAeKs1mys9N5fINwN4N2ecAD5T0OU3S2+pmkOY04CjbR/D074mbgF/r4h4RMaxGapDdHGOQ9AJJvzByDvwucBtwMbC0XLYUuKicXwwskbSjpH2pBmOuL83xLZIWShJVJe8ixtBNE/slti8r5waw/VNJ23dxj4gYVr1ZSbMn8OUqprEd8C+2L5N0A7BK0rHAfcBRALbXSVoF3A5sA463PVKq43h6ms+l5WirmwB5v6QDbd82kiDpIKo5SBERtU8Ut/0d4KAm6Q8Dh7bIcypwapP0tVRzuTvWTRP7TOBLko4Gpkn6n8AFwD9288CIGFI9aGIPWsc1SNufLm339wPTgA8BZ5SpPxERU3qpIWX+0eg5ShERQ7mbTzfTfO5okX5rfcWJiEltqjaxeeYcok7SI2IqmYq7+Uj6wMi1DecjXgrcX3upImJymgS1wm50UoMc2TR3+4ZzqH5XPAj877oLFRGT0BD2QY4ZIG3/DoCkf7L97t4XKSImrSFrYnc1D1LSXo0JkvaU9NKayxQRk9EQzoPsJkD+C89+/8zuJT0iopcb5g5ENwFyv8ZlhsU6YL+xMkraW9JVku6QtE7SCV2VMiJiALqZ5vOIpJm2NzekzQQe6yDvNuBE2zeVnTlulLTa9u3dFDYiJrAhHKTppga5Gjhb0s4A5ec/AVeMldH2Rts3lfMtwB10sJtvREwyQ9YH2U0N8iSqvdYelrQJ2AO4ETiimwdK2gd4JXBdk++WUb1Up+n2vxExgU3FieIjbG+WdAjw68CLqbY5WzvWaxMblVrnF4H32n60yTN+vtZ7u+r9ExExmUyCWmE3ut2swsD15ehK2Vj3i8DnbH+p2/wRMcENYR9k2wAp6Uzb7ynnLXfxsb1sjPsIOBe4w/bp4yloREwCU6yJvX2L824dArwFuFXSLSXtA7a//hzuGRETyVSrQdo+ruH8beN9iO1vknGXiOE2lQdpIiLGNJVqkJKeooOXa9ueVluJImJymmpNbOC3Gs7nA38CfBy4B/hl4L3Ap3pTtIiYdIasid12JY3tb40cwFuBw22fY3uN7U9TTRIfd99kRAyRHu7mI2mapJslXVI+7yZptaS7y89dG649WdJ6SXdJOqwhfZ6kW8t3Z5bZNW11s9TwJTx79/DvUdUkIyJ6udTwBKolyiNOAtbYngusKZ+RtD+wBDgAWAScJWmkC/BsqpV6c8uxaKyHdhMgbwQ+JmmnUpCdgNOAm7u4R0QMq5FR7Jq3O5M0B3gDcE5D8mJgRTlfARzZkL7S9lbb9wDrgQWSZgEzbF9TFryc35CnpW5Gsd8BfBX4YcNa7O/S5VrsiBhi3Q/SzJS0tuHz8rLkuNEZwF8Av9CQtqftjVBthiNpj5I+G7i24boNJe2Jcj46va1
u1mKvl3QgsLDc+HvAtbaHbNwqIsZlfPMgN9ue3+pLSYcDm2zfKOk1HdyvWb+i26S31e1a7Ccl/Qew10j0jojooUOAIyT9PrATMEPSBcBDkmaV2uMsYFO5fgOwd0P+OcADJX1Ok/S2Ou6DlLSzpHOBn1K165F0pKRTOr1HRAy5mgdpbJ9se47tfagGX660fTTV1otLy2VLgYvK+cXAEkk7StqXajDm+lKh2yJpYRm9PqYhT0vdDNJ8HNiTKqI/XtJuAN7UxT0iYlj196VdpwGvl3Q31euoTwOwvQ5YBdwOXAYc39ANeBzVQM964L+BS8d6iDrdzlHS94D9bf9I0g9s71bSH7G9Szd/sk5sJ3nnum/aY08MugAxIf3GoAvQpbXAo3bXeyfMf4G89uXd5dFN3NiuD3LQuumDFFXz+umEagPcH9daooiYnIZwqWE3TexvASePSns3cFV9xYmISWsI34vdTQ3yRGCNpKOBnSXdSrVH5KE9KVlETD5Dtha7m3mQ95V5kIcD+1JNEr/E9k/b54yIKWEIm9gdBUhJ2wEPU81e/2Jvi1QxGfSI4bB27EsmlE5edN/SVKxB2t4maTNVk/pnvS1SRExKQ1iD7GaQ5hTgbEljrl+MiClqCg/SfBaYBrx59E7jtneou2ARMclM1XfSSHop1YqZXahmoEdEPNskqBV2Y8wAKemNwOepao+PA2/M61oj4lmmaB/kB4EPUO3Fdko5j4h4th5smDtInQTIfYGP234MOB14aW+LFBGT0hRdSTPN9lMAtp+QlAGZiGhuEtQKu9FJgNxBUmOzeqdRn7H9kXqLFRGTzhD2QXYSIK+l2m9txHWjPhtIgIyIoTNmgLT9mj6UIyKGwRSsQUZEjG2qThSPiOhIapAREU1M0UGaiIjOpIkdEdHckFUgEyAjoh5D2MLuaj/IiIi26l6KLWknSddL+rakdZI+VNJ3k7Ra0t3l564NeU6WtF7SXZIOa0ifJ+nW8t2ZksZ8tW0CZETUokdLsbcCr7V9EHAwsEjSQuAkYI3tucCa8hlJ+wNLgAOARcBZkqaVe50NLAPmlmPRWA/vS4Bs9VsgIoZL3TVIV35cPm5fDgOLgRUlfQVwZDlfDKy0vdX2PcB6YIGkWcAM29fYNnB+Q56W+lWDbPVbICKGxDhrkDMlrW04lo2+r6Rpkm4BNgGrbV9H9QLBjQDl5x7l8tnA/Q3ZN5S02eV8dHpbfRmkKRG72W+BiBgS4xyk2Wx7ftv72k8CB0vaBfhyef10K836Fd0mva2+9UG2+C0w+pplI79JEj0jJp9e7pdr+xHgaqq+w4dKs5nyc1O5bAOwd0O2OcADJX1Ok/S2+hYgbT9p+2Cqgi1o9lvA9nLb823PH3N4KSImlF4M0kjavdQckTQdeB1wJ3AxsLRcthS4qJxfDCyRtKOkfakGY64vzfAtkhaW0etjGvK01Pd5kLYfkXQ11W+B2/r9/IjonR7Mg5wFrCgj0c8DVtm+RNI1wCpJxwL3AUcB2F4naRVwO7ANOL400QGOA84DpgOXlqMtVd2DvSVpd+CJEhynA1cAf2f7klZ5pkneqecli+i9aWNfMqE8Bjxpd92I+1XJX+syz4vgxrH6IAepXzXIpr8F+vTsiIhx6dco9n8Cr+zHsyJicIZtqWHWYkdELYZwv9wEyIioT2qQERFNDONuPgmQEVGbNLEjIppIDTIiooUEyIiINtLEjohoIjXIiIg2UoOMiGgiNciIiDYSICMimshSw4iINlKDjIhoIn2QERFtDFsTu2/vpImImGxSg4zoscMGXYAurRlnvjSxIyLaGLYmdgJkRNQiNciIiDaGLUBmkCYiajEyUbybYyyS9pZ0laQ7JK2TdEJJ303Sakl3l5+7NuQ5WdJ6SXdJOqwhfZ6kW8t3Z0oa89W2CZARUZsnuzw6sA040fYrgIXA8ZL2B04C1tieSzWudBJA+W4JcACwCDirvG4a4GxgGTC3HIvGengCZETUYqQ
Pss4AaXuj7ZvK+RbgDmA2sBhYUS5bARxZzhcDK21vtX0PsB5YIGkWMMP2NbYNnN+Qp6X0QUZEbcYxij1T0tqGz8ttL292oaR9gFcC1wF72t4IVRCVtEe5bDZwbUO2DSXtiXI+Or2tBMiIqMU4R7E3254/1kWSdga+CLzX9qNtug+bfeE26W2liR0RtejFIA2ApO2pguPnbH+pJD9Ums2Un5tK+gZg74bsc4AHSvqcJultJUBGRG3q7oMsI83nAnfYPr3hq4uBpeV8KXBRQ/oSSTtK2pdqMOb60hzfImlhuecxDXlaShM7ImrRo4nihwBvAW6VdEtJ+wBwGrBK0rHAfcBRALbXSVoF3E41An687ZFiHQecB0wHLi1HWwmQEVGbupca2v4mzfsPAQ5tkedU4NQm6WuBA7t5fgJkRNQiSw0jItpIgIyIaGIY30mTUeyIiBZSg4yI2qSJHRHRxDA2sfsaIMuuGmuB79k+vJ/PjojeSw3yuTmBajeOGX1+bkT02DBO8+nbII2kOcAbgHP69cyI6K9erMUepH7WIM8A/gL4hVYXSFpGtaFly6nzETExpQY5TpIOBzbZvrHddbaX255ve34CZMTk0osNcwetXzXIQ4AjJP0+sBMwQ9IFto/u0/Mjog8mQ7O5G32pQdo+2fYc2/tQvS/iygTHiOGSGmRERBvDVoPse4C0fTVwdb+fGxG9NYyDNKlBRkRtEiAjIprIUsOIiDZSg4yIaCJ9kBERbaSJHRHRxDDWILOjeERECwmQEVGbunfzkfQZSZsk3daQtpuk1ZLuLj93bfjuZEnrJd0l6bCG9HmSbi3fnSmpo+0eEiAjohY9Wmp4HrBoVNpJwBrbc4E15TOS9qdaynxAyXNW2aQb4GyqncLmlmP0PZtKgIyI2tQdIG3/O/CDUcmLgRXlfAVwZEP6Sttbbd8DrAcWSJoFzLB9jW0D5zfkaSuDNBFRi3FOFJ8paW3D5+W2l4+RZ0/bGwFsb5S0R0mfDVzbcN2GkvZEOR+dPqYEyIiozThGsTfbnl/T45v1K7pN+pgSICOiFn2c5vOQpFml9jgL2FTSNwB7N1w3B3igpM9pkj6m9EFGRC1Gmth9eCfNxcDScr4UuKghfYmkHSXtSzUYc31pjm+RtLCMXh/TkKet1CAjojZ11yAlXQi8hqqvcgNwCnAasErSscB9wFEAttdJWgXcDmwDjrc9UqTjqEbEpwOXlmPs51eDOhPPNMk7DboQETX4/UEXoEtrgB/YXb8W6hcl/2aXeb4ON9bYB1m71CAjojbDttRwwgbIp2DzT+C7Pbj1TGBzD+7bS5OtzJOtvNDDMn+hFzft7d/xi8eTaRjXYk/YAGl7917cV9LaiVylb2aylXmylRcmX5knanmzm09ERBOpQUZEtJEAOfmNtYxpIppsZZ5s5YXJV+YJV95hfCfNhJ3mExGTy86SD+wyz3UTfJpPVtJERLQwFZvYEdEDw9jEnjI1SEmLyi7D6yWdNOjyjKXZTsoTnaS9JV0l6Q5J6ySdMOgytSNpJ0nXS/p2Ke+HBl2mTkmaJulmSZcMuiyNerBh7kBNiQBZdhX+JPB7wP7Am8vuwxPZeXS46/EEsg040fYrgIXA8RP873kr8FrbBwEHA4skLRxwmTp1AnDHoAvRqEc7ig/UlAiQwAJgve3v2H4cWEm1+/CE1WIn5QnN9kbbN5XzLVT/gDvamHQQXPlx+bh9OSb8qKWkOcAbgHMGXZbR+rSbT99MlQA5G7i/4XPHOwrH+EjaB3glcN1gS9JeaareQrWn4GrbE7q8xRnAXzDBYkxqkJPXuHcUju5J2hn4IvBe248Oujzt2H7S9sFUm6gukNTtTJW+knQ4sMn2jYMuy2gJkJNXq52Go2aStqcKjp+z/aVBl6dTth8Brmbi9/seAhwh6V6qrqLXSrpgsDsyT7sAAAMoSURBVEV6WprYk9MNwFxJ+0ragerVkBcPuExDp+zWfC5wh+3TB12esUj
aXdIu5Xw68DrgzsGWqj3bJ9ueY3sfqv+Pr7R99ICLBaQGOWnZ3ga8C7icauBgle11gy1Ve2Un5WuAl0naUHZPnugOAd5CVau5pRwTeb/YWcBVkv6T6pfoatsTatrMZDNsNcgsNYyIWuwgea8u89w/wZcaZiVNRNRmMjSbu5EAGRG1GMalhgmQEVGb1CAjIpoYxh3Fp8QodkT0Ry9GsQe50UxqkDEhSToP2Gb77YMuS3SmFzXIho1mXk+14OMGSRfbvr3mRzWVGmQ8g6QPSrKkY7rIY0ndvjM+ohMD3WgmNcj4OUnPA46l2kXoncD5gy1RTCZPweVbqvd1d2MnSWsbPi+33fi+nWYbzfzGeMvYrQTIaHQY1Tr1I4FLJB1o+zYASb8K/D0wD5hGNcH39ZK+XfJeIekpYKXtt5e1wh+0fUHJvw9wD7C37Q2SDgU+AuxHtY/kGuA9tjf1548adbPdi3XsA91oJk3saPRO4FLbXwO+DSwDkDQL+EY59gH2Av4OoGw2C/C7tnfuos9wK9Xyz92BXwFeCHyinj9GDJGBbjSTABkASHoh1SasnylJnwHeUjZxeAtVP9BHbT9m+3Hb//Zcnmf7m7ZvsL3N9oNUtdNDn8s9YygNdKOZBMgYMdL3OLJZwwXAdOBNVLXG/6rzYZLmSbpc0oOSHgUupKpNRvzcoDeaSYCMkcGZtwO7ABskPQjcTtXXuAy4F5jb5hbN+oR+DLyg4fMLR32/ErgJ2M/2DODN4yp8DD3bX7e9n+2X2D61n89OgAyoNomdA7ya6uVVI8cbgFdRvTbhZZLeL+n5krYvgywjHuTZAXQt1cvRdpa0O/CXo76fAfwI2CLpRcCEf9NkTD0JkAHV4MxXbN9o+8GG4wqqPSmPAl7D05N1HwLe35D//wJ/I+mHkj5V0j5INW94I9VO3StHPXMZVa11C/Al4F978QeLeC6yH2RERAupQUZEtJAAGRHRQgJkREQLCZARES0kQEZEtJAAGRHRQgJkREQLCZARES38f0TnyDeH8MSaAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. 
(Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "loss: 1.3225790266836843, accuracy: 0.333766129032258, f1: 0.16503508884653428, precision: 0.34857432644627095, recall: 0.4273865645595446\n", "val mode | time: 09:01:09\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAU4AAAErCAYAAACxamqAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7hdVX3u8e9LiCEWwy2AMUFDTVCBI+EkTaNYS4lKVCpooUaPEms0iqDoQQt4rDfESxVQrPAYhSagElK8gAhiSlAfLAR2uAgJULcSYUMghmugEgn8zh9jLFzZ2XvtNRdz3fZ6P88znz3XWHPMOXYgv4wxx00RgZmZ1W+7dhfAzKzbOHCamRXkwGlmVpADp5lZQQ6cZmYFOXCamRXkwGlmHUvSDpKuk3SzpDWSPpPTPy3pHkk35eMNVXlOltQv6Q5Jh1alz5R0S/7uTEnK6eMkXZjTV0maOlK5HDjNrJNtBg6JiAOAGcA8SXPyd2dExIx8XAYgaV9gPrAfMA84S9KYfP3ZwCJgej7m5fSFwEMRMQ04A/jSSIVy4DSzjhXJY/nj2HzUmrVzOLAsIjZHxJ1APzBb0iRgQkRcE2nWz3nAEVV5lubzi4C5ldrocLZv7NcxM9vavHnzYuPGjYXyrF69+oqImFfrmlxjXA1MA74REaskvR44TtLRQB9wQkQ8BEwGrq3KPpDTnszng9PJP+8GiIgtkh4BdgOG/WUcOM2sFBs3bqSvr69QHkkvlVSdaXFELK6+JiKeAmZI2hn4oaT9Sc3uU0i1z1OA04B3A0PVFKNGOiN8NyQHTjMrSQBbimbaGBGz6rp7xMOSfg7Mi4ivVNIlfQu4NH8cAPaqyjYFuDenTxkivTrPgKTtgZ2AB2uVxe84zaxEWwoetUnaPdc0kTQeeA1we35nWfFm4NZ8fgkwP/eU703qBLouItYDmyTNye8vjwYursqzIJ8fCayMEVY/co3TzErSUI1zJJOApfk953bA8oi4VNL5kmbkh64D3gcQEWskLQfW5sIcm5v6AMcAS4DxwOX5ADgHOF9SP6mmOX+kQsnLyplZGWbNOjD6+q4qlEfaZXW9TfVO4hqnmZWkKTXOjuR3nE0m6S2SVkp6WNJmSf8t6XOSJjbpeQdJukHSE5JKa07kmRrFxpp0MEmLJB0x8pXPXL9kUO+vbaMSOMt7x9mpXONsIkmnAR8G/p00I+FRYF/g/aSZDW9uwmO/CWwADiXNuijLt4Efl3i/dltE6lD4UZ3Xn0J6N2bD6p0apwNnk0j6e+D/Agsj4tyqr34haTHwuiY9+qWksXC/KPOmETHA1gOIe4Kk8RHxx4j4bbvL0h16I3C6qd48HwFuGBQ0gTSgNyIqPXpImihpqaQHJP2PpJ9L2uqFuaR1kr4i6SOSBiQ9JGlZ1VCNg3PTfAzwNUkhaUn+LiQdN+h+WzW9Je0s6duS7s3N/Lvy+Lghr89pe0v6kaRHJW2S9GNJ0wZdE5KOl/R5SX+QtEHSNySNq/WHV2kaS3qjpLX5z+UnknaVNE3SVZIez9e8fFDeEyRdL+kRSfcPLlceCzgTWJDLF5LeVfXnfJqkf5E0QGolbNNUl3SppNvzEJnq5z4ha
b9av9voFcBTBY/u5MDZBJLGAq8Eflpnlh+RmtYfBd5K+u9y1eAgBPwjMJfUzDwROAz4fP7uBuAV+fy0fH5KgWKfDryKFPAPBT5OjdkTOfBdCbwMeC/wLmBvUo1610GXnwC8AHgH8GXS0JHj6yjTC4HPAp8g/c6vBBYDy/JxJKnVtCyPzauYAvwbaQ7ye0n/mPxK0k75+w8AtwOXkf6cXgH8pCr/24G/zde9dZiyvRfYHfgCgKSXAZ8DPhURa+r43UYhv+O0Z2c3YBxw10gXSpoHHAQcXGleS1pJGpv2MfL4tOxJ4IiI2JKvq6wE84GIeBS4NsePdRFRPV+3HrNJ84AvrEr7To3r/4kU2PaJiN/l8qwCfpfL/IWqa9dFxLvy+RWSDgLeAvzrCGXaFXhFpZmca5YfAxZExHk5TaSg91LgNoCI+EjlBnn83wrSe9/DgfMiYq2kx4E/1PhzOiwinhiuYBGxPtfivyvpx/n3vRH4ynB5Rr/eecfpGmdz1dOrPZv0F/iZd5IR8ThpCtmrBl17VSVoZmuBPSQ951mXFG4CPibpA5L2qeP62aRXEb+rJOT3oL9i23L/bNDntWw9/W046wa9W+zPP1cOkVZZsAGl2SErJD1A+pv8P8COQD2/F8CVtYJmRURcAHyfFLj3JwX07m1/Wt0cOJvjAVKP9gvruHYScP8Q6feTalzVHh70+U+kBQrKCJzHkV4ZfBK4Q9JvJNWaQfFsy71DHWUaKt/g9EraDgCSXkgK1CLVfA8C/opU46znmTD07zWcC0itixUR8ZsC+Uap3miqO3A2QUQ8Sap5HTrStcB6YI8h0vdkhIUGCtjMtsF1q+AWEQ9HxIci4vnAAcAqUjN032Hu2YpyN2Ie8Fzg8Ii4KCL+i1SbHhzMa6lr/KukCaRhZjcCb1LVauO9qXfecTpwNs9XgVmSFgz+QtJ2+d0mpAC1h6RXV33/XOCNwNUllWWA1InzzPOBQ4a7OCJ+TXqXuB3p3eFQVgEzlRZSqNx3MqkDp6xyN2I88DRb/638R7Z9n19vrbeWr5I6ng4Bvgd8u6oDqgf1TuB051CTRMSPJZ0OnJM7Qy4GHiMFoveTOn9+GhFXSPoVcKGkk0jN/I+SAsCXSyrOD4FjJd1I6rx5DzCh+gJJV+frbiX9DXgv8Dhw3TD3XELq2b9c0idJY0s+TVr89ZsllbsRK0nB7N8lnUOaaPBRtm323w4cmmuJDwB3RsQD9T5E0mGkDrLX5+XOPkj6s/saaYRBD3LnkJUgIk4gDWeZTqqRrCANzbmStFJLxZvzd18F/oP0fu6QiOinHJ/J9/0cKeDdBAweX3oN6S/8RcByYCIpKAw56D0iNpOX+CKtLrMU+D1pdEDbmuoRcQspoP01qYPt7cBRwCODLv0cqRd+OXA98Pf1PiMPt1oMfCsifpqf+yDpH5sFSpMfelDv1Di9OpKZlWLWrJdEX99ZhfJIr/HqSGbWy3qnqe7AaWYlceA0MyvIgdPMrCAHTjOzBjhwttXE8YqpXTaU+P4iE/U6wH3tLkADnm53ARpw4MyZ7S5CIevWrWPjxo1D7TU+Atc4227qTtB3dLtLUcxXyhqu3iIjLU3UiR5vdwEa0NfXXTtuzJrV6OggB04zs4IqCxmPfg6cZlYS1zjNzBrQG4HTc9XNzApyjdPMSuKmuplZQQ6cZmYFOXCamRXkwGlm1oDeCJzuVTezkpS/ArykHSRdJ+lmSWskfSan75q3gP5N/rlLVZ6TJfVLuqN6Az1JMyXdkr87U5Jy+jhJF+b0VZKmjlQuB04zK0lTts7YTNpG5gBgBjBP0hzgJODKiJhO2ormJIC8K+t80l5T84CzJI3J9zobWETaymZ6/h5gIfBQREwj7Vr6pZEK5cBpZiUpP3BG8lj+ODYfARxO2ueK/POIfH44sCwiNkfEnUA/MFvSJGBCRFwTab+g8wblqdzrImBupTY6HAdOMytJczZrkzRG0k3ABmBFRKwC9oyI9
QD55x758snA3VXZB3La5Hw+OH2rPBGxhbSx3261yuTOITMrUeHOoYmSqpePWhwRi6sviIingBmSdgZ+KGn/GvcbqqYYNdJr5RmWA6eZlaSh4Ugb693lMu9f/3PSu8n7JU2KiPW5Gb4hXzYA7FWVbQpwb06fMkR6dZ4BSdsDOwE1t7h2U93MStKUXvXdc00TSeOB1wC3A5cAC/JlC4CL8/klwPzcU743qRPoutyc3yRpTn5/efSgPJV7HQmsjBH2TW9ZjVPSPOBrwBjg2xHxxVY928xaoSkD4CcBS3PP+HbA8oi4VNI1wHJJC4G7gKMAImKNpOXA2lyYY3NTH+AYYAkwHrg8HwDnAOdL6ifVNOePVKiWBM78S38DeC2pWny9pEsiYm0rnm9mrVLuQsYR8WvgwCHSHwDmDpPnVODUIdL7gG3ej0bEE+TAW69W1ThnA/0R8TsASctIQwAcOM1GDU+5LNtQQwT+ukXPNrOW6J3A2arOobq6+yUtktQnqe8Pf2xBqczMGtCqGudwQwS2ksdvLQaY9XzV7NUys07TOzXOVgXO64HpeXjAPaReq7e36Nlm1hIOnKWKiC2SjgOuIA1HOjci1rTi2WbWKg6cpYuIy4DLWvU8M2sHB04zswJc4zQzK8iB08ysIAdOM7OCHDjNzBrgwGlmVoBrnGZmBTlwmpkV5MBpZlaQA6eZWQPKXci4UzlwmllJXOM0MyuodwKnd7k0MyvINU4zK0nv1DgdOM2sRA6cZmYFuMZpZlaQA2fb3Xw/7PHldpeimCfaXYCCemPEXftN1VCbvHau9Q3ndOA0MysueuOfYwdOMyvP0+0uQGs4cJpZOYKeef/jwGlm5XDgNDNrQI801T3l0szKUalxFjlGIGkvSVdJuk3SGknH5/RPS7pH0k35eENVnpMl9Uu6Q9KhVekzJd2SvztTSsMdJI2TdGFOXyVp6kjlcuA0s/I8XfAY2RbghIh4GTAHOFbSvvm7MyJiRj4uA8jfzQf2A+YBZ0kak68/G1gETM/HvJy+EHgoIqYBZwBfGqlQDpxmVo4m1DgjYn1E3JDPNwG3AZNrZDkcWBYRmyPiTqAfmC1pEjAhIq6JiADOA46oyrM0n18EzK3URofjwGlm5SkeOCdK6qs6Fg1369yEPhBYlZOOk/RrSedK2iWnTQburso2kNMm5/PB6VvliYgtwCPAbrV+TQdOMytH0EhTfWNEzKo6Fg91a0k7At8HPhwRj5Ka3S8GZpAmO51WuXSYkg2XXivPsBw4zaw8JTfVASSNJQXN70bEDwAi4v6IeCoinga+BczOlw8Ae1VlnwLcm9OnDJG+VR5J2wM7AQ/WKpMDp5mVozm96gLOAW6LiNOr0idVXfZm4NZ8fgkwP/eU703qBLouItYDmyTNyfc8Gri4Ks+CfH4ksDK/Bx2Wx3GaWSc7CHgncIukm3Lax4G3SZpBCtfrgPcBRMQaScuBtaQe+WMjnplAfwywBBgPXJ4PSIH5fEn9pJrm/JEKpRECa9uMlWKXkS/rKF4dyYaye7sLUNB6YHNE4SWdZr1c0XdJsTzam9URMavos9rNNU4zK4enXJqZNaBHplw6cJpZOVzjNDMrqIcCZ0uGI+WR/Rsk3Try1WbWtcqfq96RWjWOcwl/nlBvZqNRE8ZxdqqWNNUj4pf1LNVkZl2ui4NhEX7HaWblqMxV7wEdFTjzyiiLwHNBzbqSa5ytl1dGWQxp5lCbi2NmRbjGaWbWgB6pcbZqONIFwDXASyQNSFrYiueaWQu5V71cEfG2VjzHzNrMTXUzswJ6aOaQA6eZladHAqdH/ZiZFeQap5mVw8ORzMwa0CNNdQdOMyuHa5xmZg1wjdPMrAAPRzIza4Cb6mZmBbjGaWZWkAOnmVkD3FQHSZ+s5yYR8dlyimNmXcs1zmf8TdW5gFcD9wG/B14EPB/4RXOKZmZdxzVOiIjXVs4lnQ6sBL4QEZHTTgYmNrWEZtYdXOMc0tHA8ytBM/syq
QZ6QqmlMrPu1COBs8jqSH8E9h+U9r+AJ8orjpl1rcqUyyLHCCTtJekqSbdJWiPp+Jy+q6QVkn6Tf+5SledkSf2S7pB0aFX6TEm35O/OlKScPk7ShTl9VT1bmRepcZ4F/FTSN4F1wFTSjpRfL3CPum0HPLcZN26ibvsXZEy7C9CAbqzQvKTdBSjooWeTufz/QFuAEyLiBknPA1ZLWgG8C7gyIr4o6STgJOBESfsC84H9gBcA/ylpn4h4CjibFLOuBS4D5gGXAwuBhyJimqT5wJeAt9YqVN01zoj4AvAx4BX55yuBEyPi8/Xew8xGsSbUOCNifUTckM83AbcBk4HDgaX5sqXAEfn8cGBZRGyOiDuBfmC2pEnAhIi4Jr9uPG9Qnsq9LgLmVmqjwyk0jjMizgfOL5LHzKwMuQl9ILAK2DMi1kMKrpL2yJdNJtUoKwZy2pP5fHB6Jc/d+V5bJD0C7AZsHK4shQKnpL8kVYNfEBHHSdoHGBsRa4rcx8xGqeJN9YmS+qo+L46IxYMvkrQj8H3gwxHxaI0K4VBfRI30WnmGVXdTXdJrgZuBOaQedoDdga/Uew8zG8Ua2x54Y0TMqjqGCppjSUHzuxHxg5x8f25+k39uyOkDwF5V2acA9+b0KUOkb5VH0vbATsCDtX7VIr3qXwSOiog38ed/V24A/neBe5jZaFZ+r7qAc4DbIuL0qq8uARbk8wXAxVXp83NP+d7AdOC63KzfJGlOvufRg/JU7nUksHLQsMttFGmqvzgifprPAyAi/pj/NTCzXtecAfAHAe8EbpF0U077OKkit1zSQuAu4CiAiFgjaTmwltQjf2zuUQc4BlgCjCf1pl+e088BzpfUT6ppzh+pUEUC592S9o+IWysJkg4gDU0ys17XhMAZEVcz9DtIgLnD5DkVOHWI9D62HYtORDxBDrz1KtJUPxP4gaR3AGMk/QPwHeCMIg80s1Gs5KZ6p6q7xhkR38rvBk4kjZ3+DPDVPETJzHqd56oPLfd4bdPrZWYGdHUtsogiw5FuGyb9lvKKY2Zdq7HhSF2pSI1zSsF0M+s1XRwMixgxcEr6eOXaqvOKaeSpSmbW4ypz1XtAPTXOymLGY6vOIf0R3Qe8u+xCmVmXco0ziYi/A5D09Yj4YPOLZGZdqYd61QuN45T0/OoESXtKmlZymcysW/XIOM4igfN7bLu/0O453cx6XQ/1qhcJnPtUT7fM1gD7jJRxuOXvzWyUcY1zGw9LGlzjnAg8XkfeyvL3LyMtS3dsXuLezKzrFAmcK4Cz84KilYVFvw78bKSMNZa/N7PRooea6kUGwJ9EWrfuAUkbgD2A1cCbijxw0PL3ZjaadHEwLKLIIh8bJR0E/BXwItJycn0jLfhZbfDy90N8v4i0C11X7sBo1tM8AH5oOUhel49Chln+fvD9n1lEZJxUd0A2sw7hGidIOjMiPpTPh10VKSIWjXCf4Za/N7PRoocGwI9U4xw7zHlRQy5/HxGXPYt7mlmncVMdIuKYqvN/avQhIyx/b2ajgWucZmYFuXMokfQ0I2zMDhAR7gQ3M9c4s7+pOp8FvB84DbgT+Evgw8A3m1M0M+sqbqonEfGryrmkfwMOi4jf5qQrJa0ELiLtgGlmvc5N9W28mG1Xe7+HVPM0s17XQzXOInPVVwNfkbQDQP75ReDGZhTMzLqQ56pv473Aj4GHquaq/56Cc9XNbJRyr/q2IqJf0v6kZeEmk5rp10ZEF/+7YWal6pFoUHSu+lOS/gt4fkSsb1KZzKwb9VCNs+53nJJ2lHQO8EegP6cdIelTzSqcmVknKtI5dBqwJ2ne+Z9y2vXAW8sulJl1qR7pHCoSOA8D/k9ErCbPJoqIe4AXNKNgZtZlmrACvKRzJW2QdGtV2qcl3SPppny8oeq7kyX1S7pD0qFV6TMl3ZK/OzOv2IakcZIuzOmr8kLrIyoSOEVqplf/UjsCjxW4h5mNZuVv1rYEmDdE+hkRMSMflwHkfczmA/vlP
GdJqkwHP5u0SPr0fFTuuRB4KCKmAWcAX6qnUEUC56+AkwelfRC4qsA9zGy0akKNMyJ+CTxYZwkOB5ZFxOaIuJPUFzNb0iRgQkRckxdjPw84oirP0nx+ETC3UhutpUiv+gmkaZbvAHaUdAtpjc65Be5hZqNVa2cOHSfpaKCPtIPuQ6RhktdWXTOQ057M54PTyT/vBoiILZIeAXYDNtZ6eJFxnHflcZyHAXuTBr9fGhF/rJ3TzHpG8eFIEyX1VX1enLfQqeVs4BRSqD6F1HH9boZe8zdqpDPCd8OqK3BK2h54ANgzIr5fTx4z6zGN1Tg3RsSsQo+JuL9yLulbwKX54wCwV9WlU4B7c/qUIdKr8wzkOLcTdbwaqCtw5irsRlLT/Il68jxbTwGbWvGgEnXx6AproqvbXYCCntVf8BYMgJc0qWoCzpuBSo/7JcD3JJ1OGu0zHbguT9zZJGkOaVvyo4GvV+VZAFwDHAmsrGfn3iLvOD8FnC3pxDwMyczsz5rwjlPSBcDBpCb9ACkOHSxpRn7iOuB9ABGxRtJyYC2wBTi2akr4MaQe+vHA5fmAtInk+ZL6STXN+XWVq95t0SU9SdruvDKx6pmMEfGcum5SwPZS7FT2TZusJVVxsyZ7AngqovAeYbPGKfomFcuj37O6aFO9E9T7jnMaaYbQzsBvR7jczHpRD81VHzFwSnoLcCGptvkn4C3e1tfMhtQjL/rrGQD/CeDjwPNI7xc+3tQSmVl3asIA+E5VT+DcGzgtIh4HTgemNbdIZta1yp9y2ZHqecc5JiKeBoiIJyWV3hFkZqNAD+05VE/gfI6k6ub5DoM+ExGfL7dYZtaVurgWWUQ9gfNa4LVVn1cN+hyAA6dZr3ON888i4uAWlMPMrGsU2nPIzKwm1zjNzArwAHgzswa4xmlmVoA7h8zMGuCmuplZMT1S4XTgNLNy9FBL3YHTzMrTIy311gROSTsAvwTG5WdeFBGfasWzzaw1XOMs32bgkIh4TNJY4GpJl0fEtSNlNLPu4RpnifLmR4/lj2PzUd+eHWbWFXqpxlnPepylkDRG0k3ABmBFRKxq1bPNrPl6aB3j1gXOiHgqImaQ9jSeLWn/wddIWiSpT1Kfq6Nm3adH1jFuXeCsiIiHgZ8D84b4bnFEzIqIWYW32DOztnKNs2SSdpe0cz4fD7wGuL0Vzzaz1umVwNmqXvVJwFJJY0jBenlEXNqiZ5tZC/TQ4kgt61X/NXBgK55lZtZsnjlkZqXp5uZ3EQ6cZlYKN9XNzBrgGqeZWQG9NHPIgdPMStMrTfWWD4A3s9GpGQPgJZ0raYOkW6vSdpW0QtJv8s9dqr47WVK/pDskHVqVPlPSLfm7MyUpp4+TdGFOXyVpaj2/qwOnmZWiSTOHlrDtLMOTgCsjYjpwZf6MpH2B+cB+Oc9Zeew4wNnAImB6Pir3XAg8FBHTgDOAL9VTKAdOMytN2XPVI+KXwIODkg8HlubzpcARVenLImJzRNwJ9JPWxZgETIiIa/JKbecNylO510XA3EpttBa/4zSzUrSwc2jPiFgPEBHrJe2R0ycD1Wv8DuS0J/P54PRKnrvzvbZIegTYDdhYqwAOnGZWmgY6hyZK6qv6vDgiFjf4+KFqilEjvVaemhw4zawUDdY4N0bErIJ57pc0Kdc2J5HW+IVUk9yr6ropwL05fcoQ6dV5BiRtD+zEtq8GtuF3nGZWmhatjnQJsCCfLwAurkqfn3vK9yZ1Al2Xm/WbJM3J7y+PHpSncq8jgZX5PWhNrnGaWSmaMeVS0gXAwaQm/QDwKeCLwHJJC4G7gKMAImKNpOXAWmALcGxEVOLzMaQe+vHA5fkAOAc4X1I/qaY5v65y1RFc22J7KXZqdyEKeqLdBTArwRPAUxGF1xJ/iRRnF8wzF1Y30FRvO9c4zawUnnJpZtYAT7k0M7MhdWyNM0ijVs263XvaXYCCljWYz011M7MG9EpT3YHTzErhGqeZWQMcOM3MCvCeQ2ZmD
XCN08ysAL/jNDNrgJvqZmYFuMZpZlaQO4fMzBrgGqeZWQFuqpuZNcBNdTOzAlzjNDNrgAOnmVkBvdSr7oWMzcwKco3TzErjprqZWQG91FRvaeCUNAboA+6JiMNa+Wwzaz7XOJvjeOA2YEKLn2tmTdZLw5Fa1jkkaQrwRuDbrXqmmbXW0wWPbtXKGudXgX8GnjfcBZIWAYsA1KJCmVk5XOMsmaTDgA0RsbrWdRGxOCJmRcQsB06z7lIJnEWObtWqGudBwJskvQHYAZgg6TsR8Y4WPd/MWqCbm99FtKTGGREnR8SUiJgKzAdWOmiajS6ucZqZNcA1ziaJiJ97DKfZ6NOsGqekdZJukXSTpL6ctqukFZJ+k3/uUnX9yZL6Jd0h6dCq9Jn5Pv2SzpTUcFeK56qbWWma2FT/u4iYERGz8ueTgCsjYjpwZf6MpH1JrwP3A+YBZ+WJNwBnk0btTM/HvEZ+R3DgNLOSVKZctmgc5+HA0ny+FDiiKn1ZRGyOiDuBfmC2pEnAhIi4JiICOK8qT2EOnGZWmgZqnBMl9VUdi4a4bQA/k7S66vs9I2I9QP65R06fDNxdlXcgp03O54PTG+LOITMrRYMD4DdWNb+Hc1BE3CtpD2CFpNtrXDvUe8uokd4Q1zjNrDTNaKpHxL355wbgh8Bs4P7c/Cb/3JAvHwD2qso+Bbg3p08ZIr0hDpxmVopm9KpL+gtJz6ucA68DbgUuARbkyxYAF+fzS4D5ksZJ2pvUCXRdbs5vkjQn96YfXZWnMDfVzayT7Qn8MI8c2h74XkT8VNL1wHJJC4G7gKMAImKNpOXAWmALcGxEVGL0McASYDxweT4a4sBpZqUpewB8RPwOOGCI9AeAucPkORU4dYj0PmD/MsrlwGlmpeil1ZEcOM2sNA6cZmYFeM8hM7MGuMZpZlaA33GamRXkprqZWQNc4zQzK8A1TjOzBrjG2WZPw8ZN8Psm3HoisLEJ922mbitzt5UXmljmM5tx0+b+Gb+okUzuHOoAEbF7M+4rqa+OZaw6SreVudvKC91X5k4tr5vqZmYFuMZpZtYAB87Ra3G7C9CAbitzt5UXuq/MHVfeXupVV9q3yMzs2dlRiqJrtq2C1Z34rnYkXgHezKygngmckublDer7JZ3U7vKMRNK5kjZIurXdZamXpL0kXSXpNklrJB3f7jLVImkHSddJujmX9zPtLlO9JI2RdKOkS9tdlooWbw/cVj0ROPOG9N8AXg/sC7wtb1zfyZYA89pdiIK2ACdExMuAOcCxHf7nvBk4JCIOAGYA8yTNaXOZ6nU8cFu7CzFY2XsOdaqeCJykXfH6I+J3EfEnYBlp4/qOFRG/BB5sdzmKiIj1EXFDPt9E+ovd8N7VzRbJY/nj2Hx0/Et/SVOANwLfbndZqjVjs7ZO1SuBc7hN6q1JJE0FDgRWtbckteUm702k7WVXRERHlzf7KvDPdGBr10310aXUzeitNkk7At8HPoxZcGQAAANtSURBVBwRj7a7PLVExFMRMYO0z/ZsSaVs5tUskg4DNkTE6naXZTDXOEef4Tapt5JJGksKmt+NiB+0uzz1ioiHgZ/T+e+VDwLeJGkd6ZXTIZK+094iJQ6co8/1wHRJe0t6DjCftHG9lUhp8+tzgNsi4vR2l2ckknaXtHM+Hw+8Bri9vaWqLSJOjogpETGV9P/xyoh4R5uL9Qw31UeRiNgCHAdcQeqwWB4Ra9pbqtokXQBcA7xE0oCkhe0uUx0OAt5JqgXdlI83tLtQNUwCrpL0a9I/risiomOG93SbXqpxeuaQmZVinBQvKJhnXZfOHOrFuepm1gReHcnMrAEOnGZmBfTS6kgOnGZWml6pcfZEr7qZNV+zetU7cYEeB07rSJKWSOqoudg2srLHcXbqAj0OnLYVSZ+QFJKOLpAnJL2qmeWyztekGmdHLtDjwGnPkLQdsJC0KtP72lwcM+jQBXrcOWTVDiXN4z8CuFTS/hFxK4CklwP/CswExpAGL
r9W0s05788kPQ0si4j35LnUn4iI7+T8U4E7gb0iYkDSXODzwD6kdTyvBD4UERta86ta2Z6GKzal/d6L2EFSX9XnxRFRvZ9SRy7Q48Bp1d4HXB4RP8kBcRHwIUmTgF+QAuc/AE8CrwaIiAMkBfC6iLi6wLM2k6bB3kj6y7Yc+BrwtrJ+GWutiGjGAikduUCPm+oGgKQXkBbHPTcnnQu8My9+8U7Se6YvRMTjEfGniPjPZ/O8iLg6Iq6PiC0RcR8pKM99Nve0UakjF+hx4LSKyrvNyiIX3wHGA28FpgL/XebDJM2UdIWk+yQ9ClwA7F7mM6z7deoCPQ6cVukUeg+wMzAg6T5gLeld5iJgHTC9xi2Geuf0GPAXVZ8Hr/+wDLgB2CciJuAmug0jIi6LiH0i4sURcWq7ywMOnJbMI707eiVp07LK8UbgFaTtL14i6URJz5U0NnfuVNzHtoG1jzTmbkdJuwP/Muj7CcAjwCZJLwQ6YmCzWT0cOA1Sp9CPImJ1RNxXdfyMtCboUcDBwGtJL+vvB06syv//gM9KekjSN3PaJ0hD9daTVlZfNuiZi0i13E3AD4D/aMYvZtYMXo/TzKwg1zjNzApy4DQzK8iB08ysIAdOM7OCHDjNzApy4DQzK8iB08ysIAdOM7OCHDjNzAr6/6Vf9etnx4TNAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "trainer = Trainer(net=attn_model, lr=1e-3, batch_size=96, num_epochs=10)\n", "trainer.run()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, "source_hidden": false }, "nteract": { "transient": { "deleting": false } } }, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "7a6183492d0e103ac878e198fb5e468f3d279e98271ee06042fca66727adf0ef" }, "kernel_info": { "name": "python3" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "microsoft": { "host": { "AzureML": { "notebookHasBeenCompleted": true } } }, "nteract": { "version": "nteract-front-end@1.0.0" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: experiments/ecg_cnn/config.yaml ================================================ # Basic configuration file for running ecg_cnn example using json files. # Parameters needed to initialize the model model_config: model_type: SuperNet # class w/ `loss` and `inference` methods model_folder: experiments/ecg_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # how many iterations between metric eval on val set rec_freq: 500 # how many iterations between metric eval on test set initial_val: true initial_rec: true max_iteration: 2000 # how many iterations in total num_clients_per_iteration: 25 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: test_data.hdf5 test: batch_size: 10000 test_data: test_data.hdf5 type: model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 96 list_of_train_data: train_data.hdf5 desired_max_samples: 87000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.90 type: optimization ================================================ FILE: experiments/ecg_cnn/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
from experiments.ecg_cnn.dataloaders.dataset import Dataset
from core.dataloader import BaseDataLoader
import torch


class DataLoader(BaseDataLoader):
    """Data loader for the ECG experiment; wraps the experiment's `Dataset`
    in the FLUTE `BaseDataLoader` (a thin PyTorch DataLoader subclass)."""

    def __init__(self, mode, num_workers=0, **kwargs):
        # `mode` selects train vs. evaluation behavior: anything other than
        # 'train' loads the dataset in test_only mode and disables shuffling.
        args = kwargs['args']
        self.batch_size = args['batch_size']

        dataset = Dataset(
            data=kwargs['data'],
            test_only=(not mode=='train'),
            user_idx=kwargs.get('user_idx', None),
            file_type='hdf5',
        )

        super().__init__(
            dataset,
            batch_size=self.batch_size,
            shuffle=(mode=='train'),
            num_workers=num_workers,
            collate_fn=self.collate_fn,
        )

    def collate_fn(self, batch):
        # Batch is a list of (features, label) pairs; transpose it into two
        # tuples and convert each to a tensor under the 'x'/'y' keys that
        # the model's `loss`/`inference` methods expect.
        x, y = list(zip(*batch))
        return {'x': torch.tensor(x), 'y': torch.tensor(y)}


================================================
FILE: experiments/ecg_cnn/dataloaders/dataset.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import h5py
import numpy as np

from core.dataset import BaseDataset


class Dataset(BaseDataset):
    """Per-user ECG dataset.

    In test_only mode, all users' features/labels are stacked into single
    arrays; in train mode, only the data of the user selected by
    `user_idx` is exposed.
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx

        # Get all data
        self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data)

        if self.test_only:  # combine all data into single array
            self.user = 'test_only'
            self.features = np.vstack([user_data['x'] for user_data in self.user_data.values()])
            self.labels = np.hstack([user_label['x'] for user_label in self.user_data_label.values()])
        else:  # get a single user's data
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            self.user = self.user_list[user_idx]
            self.features = self.user_data[self.user]['x']
            self.labels = self.user_data_label[self.user]['x']

    def __getitem__(self, idx):
        # Reshape one beat into a single-channel (1, 187) float32 array,
        # matching the Conv1d input expected by the model.
        items = self.features[idx].astype(np.float32).T.reshape(1,187)
        return items, self.labels[idx]

    def __len__(self):
        return len(self.features)

    def load_data(self,data):
        '''Load data from disk or memory'''
        if isinstance(data, str):
            # NOTE(review): bare except hides the real h5py error (missing
            # file, permissions, corrupt file) behind a generic message.
            try:
                data = h5py.File(data, 'r')
            except:
                raise ValueError('Only HDF5 format is allowed for this experiment')

            users = []
            num_samples = data['num_samples']
            features, labels = dict(), dict()

            # Decoding bytes from hdf5
            decode_if_str = lambda x: x.decode() if isinstance(x, bytes) else x
            for user in data['users']:
                user = decode_if_str(user)
                users.append(user)
                features[user] = {'x': data['user_data'][user]['x'][()]}
                labels[user] = {'x': data['user_data_label'][user][()]}
        else:
            # In-memory dict: assumed to already follow the same schema as
            # the HDF5 branch ({'users', 'user_data', 'user_data_label',
            # 'num_samples'}, with per-user {'x': ...} entries) — the
            # consumers above index labels with ['x'], so confirm callers
            # provide that layout.
            users = data['users']
            features = data['user_data']
            labels = data['user_data_label']
            num_samples = data['num_samples']

        return users, features, labels, num_samples


================================================
FILE: experiments/ecg_cnn/model.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''The model architecture used was first created by the user polomarco
for a Kaggle competition:
https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism
However, this example has been altered to fit the FLUTE architecture'''

import torch
from torch import nn
from torch.nn import functional as F

from core.model import BaseModel


# ReLu alternative
class Swish(nn.Module):
    # Swish activation: x * sigmoid(x).
    def forward(self, x):
        return x * torch.sigmoid(x)


class ConvNormPool(nn.Module):
    """Conv Skip-connection module"""
    def __init__(
        self,
        input_size,
        hidden_size,
        kernel_size,
        norm_type='bachnorm'  # (sic) any value other than 'group' selects BatchNorm1d
    ):
        super().__init__()

        self.kernel_size = kernel_size
        self.conv_1 = nn.Conv1d(
            in_channels=input_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.conv_2 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.conv_3 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.swish_1 = Swish()
        self.swish_2 = Swish()
        self.swish_3 = Swish()
        if norm_type == 'group':
            self.normalization_1 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
            self.normalization_2 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
            self.normalization_3 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
        else:
            self.normalization_1 = nn.BatchNorm1d(num_features=hidden_size)
            self.normalization_2 = nn.BatchNorm1d(num_features=hidden_size)
            self.normalization_3 = nn.BatchNorm1d(num_features=hidden_size)

        self.pool = nn.MaxPool1d(kernel_size=2)

    def forward(self, input):
        # First conv output is kept for the residual (skip) connection below.
        conv1 = self.conv_1(input)
        x = self.normalization_1(conv1)
        x = self.swish_1(x)
        # Left-pad by (kernel_size - 1) so the next conv preserves length.
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        x = self.conv_2(x)
        x = self.normalization_2(x)
        x = self.swish_2(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        conv3 = self.conv_3(x)
        # Skip connection: add the first conv's output before normalizing.
        x = self.normalization_3(conv1+conv3)
        x = self.swish_3(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        x = self.pool(x)
        return x


class RNN(nn.Module):
    """RNN module(cell type lstm or gru)"""
    def __init__(
        self,
        input_size,
        hid_size,
        num_rnn_layers=1,
        dropout_p = 0.2,
    ):
        super().__init__()
        # nn.LSTM applies inter-layer dropout only when num_layers > 1.
        self.rnn_layer = nn.LSTM(
            input_size=input_size,
            hidden_size=hid_size,
            num_layers=num_rnn_layers,
            dropout=dropout_p if num_rnn_layers>1 else 0,
            bidirectional=False,
            batch_first=True,
        )

    def forward(self, input):
        # For LSTM, hidden_states is the (h_n, c_n) tuple.
        outputs, hidden_states = self.rnn_layer(input)
        return outputs, hidden_states


class Net(nn.Module):
    """CNN + LSTM + attention network for 5-class ECG beat classification."""
    def __init__(
        self,
        input_size=1,
        hid_size=64,
        n_classes=5,
        kernel_size=5,
    ):
        super().__init__()

        # input_size=46: assumed to be the temporal length after the two
        # ConvNormPool blocks for 187-sample beats — TODO confirm.
        self.rnn_layer = RNN(
            input_size=46,
            hid_size=hid_size,
        )
        self.conv1 = ConvNormPool(
            input_size=input_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.conv2 = ConvNormPool(
            input_size=hid_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.avgpool = nn.AdaptiveMaxPool1d((1))
        self.attn = nn.Linear(hid_size, hid_size, bias=False)
        self.fc = nn.Linear(in_features=hid_size, out_features=n_classes)

    def forward(self, input):
        x = self.conv1(input)
        x = self.conv2(x)
        x_out, hid_states = self.rnn_layer(x)
        # Concatenate the LSTM's final hidden and cell states (h_n, c_n)
        # along dim 0, then move batch first — kept as in the original
        # Kaggle model.
        x = torch.cat([hid_states[0], hid_states[1]], dim=0).transpose(0, 1)
        # Attention: scores over the LSTM states applied to its outputs.
        x_attn = torch.tanh(self.attn(x))
        x = x_attn.bmm(x_out)
        x = x.transpose(2, 1)
        x = self.avgpool(x)
        x = x.view(-1, x.size(1) * x.size(2))
        # Output is softmax-normalized class probabilities.
        x = F.softmax(self.fc(x), dim=-1)
        return x


class SuperNet(BaseModel):
    '''This is the parent of the net with some extra methods'''
    def __init__(self, model_config):
        super().__init__()
        self.net = Net()

    def loss(self, input: torch.Tensor):
        # NOTE(review): F.cross_entropy expects raw logits, but Net.forward
        # already applies softmax, so the outputs are normalized twice —
        # kept as in the original Kaggle model.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        output = self.net.forward(features)
        return F.cross_entropy(output, labels.long())

    def inference(self, input):
        # Returns the per-batch outputs, accuracy and batch size in the
        # dict format FLUTE's evaluation loop consumes.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        output = self.net.forward(features)
        n_samples = features.shape[0]
        accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item()

        return {'output':output, 'acc': accuracy, 'batch_size': n_samples}


================================================
FILE: experiments/ecg_cnn/readme.md
================================================
# Example of CNN-LSTM model on Arrhythmia dataset

The objective of this experiment is to show the capabilities of FLUTE in data setting relevant to the healthcare sector.

### Federating the MIT-BIH Arrhythmia Dataset

In this experiment, a processed version of [MIT-BIH Arrhythmia Dataset](https://www.physionet.org/content/mitdb/1.0.0/) is used. In particular, we are using the dataset version found on [this Kaggle competition](https://www.kaggle.com/shayanfazeli/heartbeat).

Excerpt from the original [MIT-BIH Arrhythmia Database](https://physionet.org/content/mitdb/1.0.0/):

> The MIT-BIH Arrhythmia Database contains 48 half-hour excerpts of two-channel ambulatory ECG recordings, obtained from 47 subjects studied by the BIH Arrhythmia Laboratory between 1975 and 1979.
Twenty-three recordings were chosen at random from a set of 4000 24-hour ambulatory ECG recordings collected from a mixed population of inpatients (about 60%) and outpatients (about 40%) at Boston's Beth Israel Hospital; the remaining 25 recordings were selected from the same set to include less common but clinically significant arrhythmias that would not be well-represented in a small random sample. What this means for us: the federation in this example is a exemplar one, as the 47 subjects and their 48 half-hour excerpts are split up into the 109446 labeled samples of length 187. The sampling frequency is 125Hz and the number of categories is five. The categories are: ```['N': 0, 'S': 1, 'V': 2, 'F': 3, 'Q': 4]``` Or: ```-N : Non-ecotic beats (normal beat) -S : Supraventricular ectopic beats -V : Ventricular ectopic beats -F : Fusion Beats -Q : Unknown Beats``` The classes in the dataset are quite skewed; the *normal beats* class is present in 82.77% of samples. Using synthetic data could possibly increase the performance of the models by decreasing the class imbalance (e.g. by using [this GAN]([GitHub - mandrakedrink/ECG-Synthesis-and-Classification: 1D GAN for ECG Synthesis and 3 models: CNN, LSTM, and Attention mechanism for ECG Classification.](https://github.com/mandrakedrink/ECG-Synthesis-and-Classification)) for data synthesis) but is not too relevant for our experiment of transferring this experiment to FLUTE. #### Model architecture The model architecture is largely taken from [this notebook on Kaggle](https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism). The architecture has been altered to fit the FLUTE architecture. The image below showcases the general model architecture. ![network](./net.png) The FLUTE-ready model can be found in `model.py`. Here, `SuperNet` is the parent class of the model various model network classes. `SuperNet` contains the `loss` and `inference` methods which FLUTE expects. 
`SuperNet` is therefore also the `model_type` set in `config.yaml`.

The file `centralized_model.ipynb` can be used to test a centralized run of the model. Running this model expects the csv test and train files to be added to a `.\ecg_cnn\data\mitbih\` folder. This model has higher performance than the remote model (roughly 94% as opposed to 87% accuracy). This is not fully unexpected, since the federated model could have more issues dealing with the class imbalance.

#### Preparing the data

First, place the `mitbih_test.csv` and `mitbih_train.csv` files in the folder `.\ecg_cnn\data\mitbih\`. Next, run preprocess.py in the `utils` folder to generate the HDF5 files.

## Specifying dataset and dataloaders

Inside the `dataloaders` folder, there are two files: `dataset.py` and `dataloader.py`. Both inherit from the base classes declared in the `core` folder, which under the hood inherit from Pytorch classes with the same name.

The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `user_data_labels` and `num_samples` (user names, user features, user labels if the problem is supervised, and number of samples for each user, respectively). These attributes are required to have these exact names. Otherwise, it should also be able to access the examples of a specific user, whose id is passed during initialization via the `user_idx` argument.

The dataloader is simpler, and essentially just instantiates the dataset and creates batches with a specific format.

## Creating a config file

All the parameters of the experiment are passed in a YAML file. A documented example is provided in `config.yaml`.
## Running the experiment locally

Finally, to launch the experiment, it suffices to launch the `e2e_trainer.py` script using torch.distributed:

`python -m torch.distributed.run --nproc_per_node=2 .\e2e_trainer.py -dataPath experiments/ecg_cnn/data -outputPath scratch -config experiments/ecg_cnn/config.yaml -task ecg_cnn -backend nccl`

The `dataPath`, `outputPath` and `config` arguments should just specify the respective files or folders, as in the example above -- in this case, a folder called `scratch` will be created containing logs and checkpoints. The task should be the name of the folder inside `experiments`.

## Running the experiments on Azure Machine Learning

In order to run the experiment on Azure Machine Learning, you first need to follow the steps described [here](#Experiments). Make sure the HDF5 dataset is uploaded, the compute has a GPU and is running, and your YAML file is properly set up. An example file for running this experiment would be the following:

```yaml
experiment_name: ecg_cnn_run
description: FLUTE heartbeat dataset example
code:
  local_path: .
compute: azureml:compute_with_gpu
environment:
  image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
inputs:
  data:
    folder: azureml://datastores/workspaceblobstore/paths/data
    mode: rw_mount
command: >
  apt -y update &&
  apt -y install openmpi-bin libopenmpi-dev openssh-client &&
  python3 -m pip install --upgrade pip &&
  python3 -m pip install -r requirements.txt &&
  python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py
  -outputPath=./outputs -dataPath={inputs.data} -task=ecg_cnn
  -config=./experiments/ecg_cnn/config.yaml -backend=nccl
```

To run your job, you can then use the following command:

`az ml job create -f ./run.yaml -w "YourWorkspaceName" -g "YourResourceGroup"`

The job should now be created and uploaded, after which it can be found in the AzureML Studio.
================================================ FILE: experiments/ecg_cnn/utils/preprocess.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import h5py import time import tqdm import csv import pandas as pd from sklearn.utils import resample def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in tqdm.tqdm(data_dict['user_data'].items()): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in tqdm.tqdm(data_dict['user_data_label'].items()): user_data_label_group.create_dataset(user, data=user_data_label) def _process_and_save_to_disk(dataset, n_users, output): '''Process the dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) # Convert training data to expected format print('Converting data to expected format...') start_time = time.time() data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : n_users * [samples_per_user], 'user_data' : {f'{user_id:04d}': dataset.data[user_idxs(user_id)] for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } print(f'Finished converting data in {time.time() - 
start_time:.2f}s.') # Save training data to disk print('Saving data to disk...') start_time = time.time() with h5py.File(output + '.hdf5', 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) print(f'Finished saving data in {time.time() - start_time:.2f}s.') class HeartDataSet: def __init__(self, heartdata, cutoff): self.data = [row[:187] for row in heartdata][:cutoff] self.targets = [int(float(row[187])) for row in heartdata][:(round(len(heartdata), -3))][:cutoff] def __len__(self): return len(self.data) # From https://www.kaggle.com/gregoiredc/arrhythmia-on-ecg-classification-using-cnn/notebook # Can be used to creating resampled training set for less class imbalance def resampleSet(train_df): train_df[187]=train_df[187].astype(float).astype(int) df_1=train_df[train_df[187]==1] df_2=train_df[train_df[187]==2] df_3=train_df[train_df[187]==3] df_4=train_df[train_df[187]==4] df_0=(train_df[train_df[187]==0]).sample(n=40001,random_state=42) df_1_upsample=resample(df_1,replace=True,n_samples=10000,random_state=123) df_2_upsample=resample(df_2,replace=True,n_samples=20000,random_state=124) df_3_upsample=resample(df_3,replace=True,n_samples=5000,random_state=125) df_4_upsample=resample(df_4,replace=True,n_samples=20000,random_state=126) train_df=pd.concat([df_0,df_1_upsample,df_2_upsample,df_3_upsample,df_4_upsample]) return train_df # Uncomment lines below for resampled dataset with open('../data/mitbih/mitbih_test.csv') as f: testset = list(csv.reader(f , delimiter=',')) TestDataset = HeartDataSet(testset, 21000) _process_and_save_to_disk(TestDataset,1000,'../data/test_data') with open('../data/mitbih/mitbih_train.csv') as f: trainset = csv.reader(f , delimiter=',') trainsetlist = list(trainset) TrainDataset = HeartDataSet(trainsetlist, 87000) _process_and_save_to_disk(TrainDataset,1000,'../data/train_data') ================================================ FILE: experiments/fednewsrec/README.md 
================================================

### Data

In order to run this experiment, you first need to download the MIND dataset [here](https://msnews.github.io/index.html) and the glove.840B.300d embedding vector [here](https://nlp.stanford.edu/projects/glove/). Once you have the data, make sure to replace the `root_data_path` and `embedding_path` parameters inside [dataset.py](dataloaders/dataset.py) and the [configuration file](config.yaml). The preprocessing steps will be done automatically by FLUTE once the job is launched.

### Run

Once the paths for the dataset and embedding have been updated, you can run the experiment as follows:

```code
python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/fednewsrec/config.yaml -task fednewsrec -backend nccl
```

### Results

- MIND_Large, 1500 rounds, 6 clients per round:

|Platform|AUC|MRR|nDCG5|nDCG10|
|:----|:----|:----|:----|:----|
|FedNews|0.54|0.23|0.25|0.32|
|FLUTE|0.58|0.24|0.26|0.33|

================================================
FILE: experiments/fednewsrec/config.yaml
================================================
# Parameters needed to initialize the model
model_config:
  model_type: FEDNEWS # class w/ `loss` and `inference` methods
  model_folder: experiments/fednewsrec/model.py # file containing class
  embbeding_path: /mnt/data/MIND_large

# Configuration for differential privacy
dp_config:
  enable_local_dp: false # whether to enable user-level DP

# Additional privacy metrics
privacy_metrics_config:
  apply_metrics: false # cache data to compute additional metrics

# Select the Federated optimizer to use (e.g.
DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: true # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # how many iterations between metric eval on val set rec_freq: 2000 # how many iterations between metric eval on test set initial_val: true initial_rec: false max_iteration: 1500 # how many iterations in total num_clients_per_iteration: 500 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 1 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 1 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.1 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: auc fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 64 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.1 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/fednewsrec/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.fednewsrec.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] self.mode = mode dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): if self.mode == "train": # For training click, sample, label = list(zip(*batch)) click = torch.tensor(click) sample = torch.tensor(sample) label = torch.tensor(label) return {'x': (click, sample), 'y': label} else: # For testing -- data format is different nv_hist = torch.stack(batch[0][0]).squeeze(1) nv_imp = torch.stack(batch[0][1]).squeeze(1) label = batch[0][2] return {'x': (nv_hist, nv_imp), 'y': label} ================================================ FILE: experiments/fednewsrec/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class Dataset(BaseDataset):
    """Dataset for the FedNewsRec experiment.

    In test mode all users' data is flattened into one split; in train
    mode only the data of the user selected by ``user_idx`` is exposed.
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx

        # Load the full federated dataset (all users).
        (self.user_list,
         self.user_data,
         self.user_data_label,
         self.num_samples) = self.load_data(data, self.test_only)

        if user_idx == -1:
            # No per-user selection requested; keep only the raw tables.
            return

        if self.test_only:
            # Combine every user's data into flat lists.
            self.user = 'test_only'
            self.labels = list(self.user_data_label.values())
            self.features_x = [entry['x'] for entry in self.user_data.values()]
            self.features_y = [entry['y'] for entry in self.user_data.values()]
        else:
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            # Restrict to a single user's samples.
            self.user = self.user_list[user_idx]
            per_user = self.user_data[self.user]
            self.features_x = per_user['x']
            self.features_y = per_user['y']
            self.labels = self.user_data_label[self.user]

    def __getitem__(self, idx):
        return self.features_x[idx], self.features_y[idx], self.labels[idx]

    def __len__(self):
        return len(self.features_x)

    def load_data(self, data, test_only):
        '''Wrapper method to read/instantiate the dataset'''
        if data is None:
            # Fall back to preprocessing MIND from the default mount point.
            dataset = MIND(root_data_path="/mnt/data/MIND_large",
                           embedding_path="/mnt/data/MIND_large")
            data = dataset.testset if test_only else dataset.trainset
        return (data['users'], data['user_data'],
                data['user_data_label'], data['num_samples'])
class MIND:
    """Preprocesses the MIND dataset into FLUTE's federated dict format.

    Builds ``self.trainset`` (per-user click/candidate/label arrays) and
    ``self.testset`` (per-impression history/candidate tensors) from the
    raw MIND behavior/news files plus GloVe title embeddings.
    """

    # Cap on the number of training users kept in the federated split.
    MAX_TRAIN_USERS = 50000

    def __init__(self, root_data_path, embedding_path):
        # --- Preprocessing: parse news, embeddings and click histories ---
        news, news_index, category_dict, subcategory_dict, word_dict = \
            read_news(root_data_path, ['train', 'val'])
        news_title, news_vert, news_subvert = get_doc_input(
            news, news_index, category_dict, subcategory_dict, word_dict)
        title_word_embedding_matrix, have_word = load_matrix(embedding_path, word_dict)
        train_session, train_uid_click, train_uid_table = read_clickhistory(root_data_path, 'train')
        test_session, test_uid_click, test_uid_table = read_clickhistory(root_data_path, 'val')
        train_user = parse_user(train_session, news_index)
        test_user = parse_user(test_session, news_index)
        train_sess, train_user_id, train_label, train_user_id_sample = \
            get_train_input(train_session, train_uid_click, news_index)
        test_impressions, test_userids = get_test_input(test_session, news_index)
        get_user_data = GetUserDataFunc(
            news_title, train_user_id_sample, train_user,
            train_sess, train_label, train_user_id)

        # --- Train split: one entry per user ---
        print("Preparing train datasets ...")
        train_dict = {'users': [], 'num_samples': [],
                      'user_data': dict(), 'user_data_label': dict()}
        # Fix: clamp to the number of available users instead of assuming
        # the behaviors file always yields at least 50000 distinct users,
        # which raised a KeyError on smaller subsets.
        num_train_users = min(self.MAX_TRAIN_USERS, len(train_uid_table))
        for uidx in range(num_train_users):
            uid = train_uid_table[uidx]
            click, sample, label = get_user_data(uid)
            user = str(uidx)  # uid
            train_dict['users'].append(user)
            train_dict['num_samples'].append(len(click))
            train_dict['user_data'][user] = {'x': click, 'y': sample}
            train_dict['user_data_label'][user] = label

        # --- Test split: one entry per impression ---
        print("Preparing test datasets ...")
        test_dict = {'users': [], 'num_samples': [],
                     'user_data': dict(), 'user_data_label': dict()}
        # Pre-build one (1, MAX_SENTENCE) tensor per news article so the
        # per-impression loops below only index, never re-allocate.
        doc_cache = []
        for j in range(len(news_title)):
            doc_cache.append(torch.from_numpy(np.array([news_title[j]])))
        for i in range(len(test_impressions)):
            docids = test_impressions[i]['docs']
            labels = test_impressions[i]['labels']
            nv_hist = [doc_cache[j] for j in test_user['click'][i]]
            nv_imp = [doc_cache[j] for j in docids]
            user = str(i)
            test_dict['users'].append(user)
            test_dict['num_samples'].append(len(nv_imp))
            test_dict['user_data'][user] = {'x': nv_hist, 'y': nv_imp}
            test_dict['user_data_label'][user] = labels

        self.trainset = train_dict
        self.testset = test_dict
def load_matrix(embedding_path, word_dict):
    """Load GloVe vectors for the words in ``word_dict``.

    Returns a ``(len(word_dict)+1, 300)`` matrix (row 0 is the padding
    row, left as zeros) plus the list of vocabulary words that were
    found in the embedding file.
    """
    embedding_matrix = np.zeros((len(word_dict) + 1, 300))
    have_word = []
    glove_file = os.path.join(embedding_path, 'glove.840B.300d.txt')
    with open(glove_file, 'rb') as f:
        # Iterating the file object stops at EOF, mirroring the original
        # manual readline loop.
        for raw_line in f:
            fields = raw_line.split()
            word = fields[0].decode()
            if word in word_dict:
                row = word_dict[word]
                embedding_matrix[row] = np.array([float(v) for v in fields[1:]])
                have_word.append(word)
    return embedding_matrix, have_word
def get_test_input(session, news_index):
    """Build per-session test impressions.

    Each impression lists the candidate doc ids (positives first, then
    negatives) with parallel 1/0 relevance labels. Also returns the
    session ids as an int32 array.
    """
    impressions = []
    session_ids = []
    for sess_id, (_, positives, negatives) in enumerate(session):
        session_ids.append(sess_id)
        docs = [news_index[nid] for nid in positives]
        labels = [1] * len(positives)
        docs += [news_index[nid] for nid in negatives]
        labels += [0] * len(negatives)
        impressions.append({'labels': labels, 'docs': docs})
    return impressions, np.array(session_ids, dtype='int32')
class AttentivePooling(nn.Module):
    """Attention-weighted pooling over the sequence dimension.

    Scores each of the ``dim1`` timesteps of a (batch, dim1, dim2) input
    with a small MLP and returns the attention-weighted sum of the
    timestep vectors, shape (batch, dim2).
    """

    def __init__(self, dim1: int, dim2: int):
        super(AttentivePooling, self).__init__()
        self.dim1 = dim1
        self.dim2 = dim2
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(dim2, 200)
        self.tanh = nn.Tanh()
        self.dense2 = nn.Linear(200, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        vecs = self.dropout(x)
        # Per-timestep attention logits: (batch, seq, 1) -> (batch, seq).
        logits = self.dense2(self.tanh(self.dense(vecs))).squeeze(2)
        weights = self.softmax(logits)
        # Weighted sum over the sequence axis.
        return torch.einsum('ijk,ij->ik', vecs, weights)

    def fromTensorFlow(self, tfmodel):
        """Copy weights from the matching Keras model (moved to GPU)."""
        dense_w = tfmodel.layers[1].get_weights()
        self.dense.weight.data = torch.tensor(dense_w[0]).transpose(0, 1).cuda()
        self.dense.bias.data = torch.tensor(dense_w[1]).cuda()
        dense2_w = tfmodel.layers[2].get_weights()
        self.dense2.weight.data = torch.tensor(dense2_w[0]).transpose(0, 1).cuda()
        self.dense2.bias.data = torch.tensor(dense2_w[1]).cuda()
class Permute(nn.Module):
    """Reorders tensor axes; an ``nn.Module`` form of ``Tensor.permute``."""

    def __init__(self, *dims):
        super(Permute, self).__init__()
        # Axis order applied to every input tensor.
        self.dims = tuple(dims)

    def forward(self, x):
        return x.permute(*self.dims)
class VecTail(nn.Module):
    """Keeps only the trailing ``n`` timesteps of a (batch, seq, dim) tensor."""

    def __init__(self, n):
        super(VecTail, self).__init__()
        self.n = n

    def forward(self, x):
        # Negative slicing keeps at most the final n entries along dim 1
        # (the whole sequence when it is shorter than n).
        return x[:, -self.n:, :]
news_vecs_input = Input(shape=(50,400), dtype='float32') #self.dropout1 = nn.Dropout(0.2) #self.tail = VecTail(15) #self.gru = nn.GRU(400, 400) #self.attention = nn.MultiheadAttention(400, 20) #self.pool = AttentivePooling(50, 400) #self.attention2 = nn.MultiheadAttention(400, 20, batch_first=True) self.attention2 = Attention(400, 20, 20) #torch.nn.init.xavier_uniform_(self.attention2.in_proj_weight, gain=np.sqrt(2)) self.dropout2 = nn.Dropout(0.2) self.pool2 = AttentivePooling(50, 400) self.tail2 = VecTail(20) #TODO: what is batch_first? self.gru2 = nn.GRU(400,400, bidirectional=False, batch_first=True) self.pool3 = AttentivePooling(2, 400) def forward(self, news_vecs_input): #news_vecs =self.dropout1(news_vecs_input) #gru_input = self.tail(news_vecs) #vec1 = self.gru(gru_input) #vecs2 = self.attention(*[news_vecs]*3) #vec2 = self.pool(vecs2) # print('news_vecs_input', news_vecs_input.shape) user_vecs2 = self.attention2([news_vecs_input]*3) user_vecs2 = self.dropout2(user_vecs2) user_vec2 = self.pool2(user_vecs2) # print('pool2_user_vec2', user_vec2.shape) #user_vec2 = keras.layers.Reshape((1,400))(user_vec2) #user_vec2 = user_vec2.unsqueeze(1) user_vecs1 = self.tail2(news_vecs_input) # print('tail2_user_vecs1', user_vecs1.shape) self.gru2.flatten_parameters() user_vec1, _u_hidden = self.gru2(user_vecs1) # print('gru2_user_vec1', user_vec1.shape) # TODO: does this flatten the second dimension? 
print out the shape to check user_vec1 = user_vec1[:, -1, :] #user_vec1 = keras.layers.Reshape((1,400))(user_vec1) #user_vec1 = user_vec1.unsqueeze(1) user_vecs = torch.stack([user_vec1, user_vec2], dim=1) #keras.layers.Concatenate(axis=-2)([user_vec1,user_vec2]) # print(user_vecs.shape) vec = self.pool3(user_vecs) # print(vec.shape) return vec def fromTensorFlow(self, tfU): for l in tfU.layers: print(l.name, l.output_shape) if l.name == 'model_1': self.pool2.fromTensorFlow(l) elif l.name == 'model_2': self.pool3.fromTensorFlow(l) elif l.name=='gru_1': print(len(l.get_weights()), [p.shape for p in l.get_weights()]) weights = l.get_weights() for p in self.gru2.named_parameters(): s1 = p[1].data.shape if p[0] == 'weight_ih_l0': p[1].data = torch.tensor(weights[0]).transpose(0,1).contiguous().cuda() elif p[0] == 'weight_hh_l0': p[1].data = torch.tensor(weights[1]).transpose(0,1).contiguous().cuda() elif p[0] == 'bias_ih_l0': p[1].data = torch.tensor(weights[2]).cuda() elif p[0] == 'bias_hh_l0': p[1].data = torch.zeros(p[1].data.shape).cuda() print(p[0], s1, p[1].shape) self.attention2.fromTensorFlow(tfU) # TODO: GRU class TimeDistributed(nn.Module): def __init__(self, module): #, batch_first=False): super(TimeDistributed, self).__init__() self.module = module # self.batch_first = batch_first def forward(self, x): # print('TimeDist_x',x.size()) if len(x.size()) <= 2: return self.module(x) output = torch.tensor([]).cuda(x.get_device()) for i in range(x.size(1)): output_t = self.module(x[:, i, :, :]) output_t = output_t.unsqueeze(1) output = torch.cat((output, output_t ), 1) # print('TimeDist_output', output.size()) return output # # Squash samples and timesteps into a single axis # x_reshape = x.contiguous().view(x.size(0), -1, x.size(-1)) # (samples * timesteps, input_size) #print('TimeDist_x_reshape',x_reshape.shape) # y = self.module(x_reshape) # print('TimeDist_y', y.shape) # # We have to reshape Y # if self.batch_first: # y = y.contiguous().view(x.size(0), -1, 
class FedNewsRec(nn.Module):
    """News recommendation model: scores candidate news against a user
    vector built from the user's click history.

    Ported to PyTorch from the FedNewsRec (EMNLP Findings 2020) Keras code.
    """

    def __init__(self, title_word_embedding_matrix):
        super(FedNewsRec, self).__init__()
        self.doc_encoder = DocEncoder()
        self.user_encoder = UserEncoder()
        # Frozen pretrained word embeddings for news titles.
        self.title_word_embedding_layer = nn.Embedding.from_pretrained(
            torch.tensor(title_word_embedding_matrix, dtype=torch.float),
            freeze=True,
        )
        self.softmax = nn.Softmax(dim=1)
        # The same doc encoder is applied per-timestep to both inputs.
        self.click_td = TimeDistributed(self.doc_encoder)
        self.can_td = TimeDistributed(self.doc_encoder)

    def forward(self, click_title, can_title):
        """Return ``(logits, user_vec)``.

        ``click_title``: token ids of clicked news (batch, 50, 30).
        ``can_title``:   token ids of candidates (batch, 1+npratio, 30).
        Logits are unnormalized; PyTorch's CrossEntropyLoss accepts raw
        scores, so no softmax is applied here.
        """
        click_word_vecs = self.title_word_embedding_layer(click_title)
        can_word_vecs = self.title_word_embedding_layer(can_title)

        click_vecs = self.click_td(click_word_vecs)   # (batch, 50, 400)
        can_vecs = self.can_td(can_word_vecs)         # (batch, 1+npratio, 400)

        user_vec = self.user_encoder(click_vecs)      # (batch, 400)
        # Dot product of every candidate vector with the user vector.
        logits = torch.einsum('ijk,ik->ij', can_vecs, user_vec)
        return logits, user_vec

    def news_encoder(self, news_title):
        """Encode raw title token ids directly into news vectors."""
        word_vecs = self.title_word_embedding_layer(news_title)
        return self.doc_encoder(word_vecs)
class FEDNEWS(BaseModel):
    '''This is a PyTorch model with some extra methods'''

    def __init__(self, model_config):
        super().__init__()
        # NOTE(review): both paths deliberately read the same (misspelled)
        # 'embbeding_path' key -- kept as-is for config compatibility.
        root_data_path = model_config['embbeding_path']
        embedding_path = model_config['embbeding_path']
        news, news_index, category_dict, subcategory_dict, word_dict = \
            self.read_news(root_data_path, ['train', 'val'])
        title_word_embedding_matrix, _ = self.load_matrix(embedding_path, word_dict)
        self.net = FedNewsRec(title_word_embedding_matrix)

    def loss(self, input: torch.Tensor) -> torch.Tensor:
        '''Performs forward step and computes the loss'''
        if not self.net.training:
            return torch.tensor(0)  # Not using the loss during evaluation
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        (click, sample), label = input['x'], input['y']
        click = click.to(device)
        sample = sample.to(device)
        label = label.to(device)
        output, _ = self.net.forward(click, sample)
        return CrossEntropyLoss()(output, label)

    def inference(self, input):
        '''Performs forward step and computes metrics'''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        (nv_hist, nv_imp), labels = input['x'], input['y']
        nv_hist = nv_hist.to(device)
        nv_imp = nv_imp.to(device)
        # Candidate news vectors and the user's vector (from history).
        nv = self.net.news_encoder(nv_imp).detach().cpu().numpy()
        nv_hist = self.net.news_encoder(nv_hist)
        uv = self.net.user_encoder(nv_hist.unsqueeze(0)).detach().cpu().numpy()[0]
        score = np.dot(nv, uv)
        return {
            'output': None,
            'acc': ndcg_score(labels, score, k=1),
            'batch_size': 1,
            'auc': {'value': roc_auc_score(labels, score), 'higher_is_better': True},
            'mrr': {'value': mrr_score(labels, score), 'higher_is_better': True},
            'ndcg5': {'value': ndcg_score(labels, score, k=5), 'higher_is_better': True},
            'ndcg10': {'value': ndcg_score(labels, score, k=10), 'higher_is_better': True},
        }

    def read_news(self, root_data_path, modes):
        '''Parse news.tsv for each mode; build news/category/word vocabularies.

        Document and word indices start at 1 so index 0 can act as padding.
        '''
        news = {}
        categories = []
        subcategories = []
        news_index = {}
        doc_counter = 1
        word_dict = {}
        word_counter = 1
        for mode in modes:
            with open(os.path.join(root_data_path, mode, 'news.tsv'), encoding="utf8") as f:
                for line in f.readlines():
                    doc_id, vert, subvert, title = line.strip('\n').split('\t')[0:4]
                    if doc_id in news_index:
                        continue
                    news_index[doc_id] = doc_counter
                    doc_counter += 1
                    categories.append(vert)
                    subcategories.append(subvert)
                    title = word_tokenize(title.lower())
                    news[doc_id] = [vert, subvert, title]
                    for word in title:
                        word = word.lower()
                        if word not in word_dict:
                            word_dict[word] = word_counter
                            word_counter += 1
        category_dict = {c: i for i, c in enumerate(set(categories), start=1)}
        subcategory_dict = {c: i for i, c in enumerate(set(subcategories), start=1)}
        return news, news_index, category_dict, subcategory_dict, word_dict

    def load_matrix(self, embedding_path, word_dict):
        '''Load the GloVe vectors for words present in word_dict.'''
        embedding_matrix = np.zeros((len(word_dict) + 1, 300))
        have_word = []
        with open(os.path.join(embedding_path, 'glove.840B.300d.txt'), 'rb') as f:
            for raw in f:
                fields = raw.split()
                word = fields[0].decode()
                if word in word_dict:
                    embedding_matrix[word_dict[word]] = \
                        np.array([float(v) for v in fields[1:]])
                    have_word.append(word)
        return embedding_matrix, have_word
def mrr_score(y_true, y_score):
    """Mean reciprocal rank of the relevant items under the score ranking."""
    ranking = np.argsort(y_score)[::-1]
    relevance = np.take(y_true, ranking)
    reciprocal = relevance / (np.arange(len(relevance)) + 1)
    return np.sum(reciprocal) / np.sum(relevance)


def ndcg_score(y_true, y_score, k=10):
    """Normalized DCG@k: actual DCG divided by the ideal (best) DCG."""
    ideal = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / ideal


def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of the top-k items ranked by y_score."""
    top_k = np.argsort(y_score)[::-1][:k]
    gains = 2 ** np.take(y_true, top_k) - 1
    discounts = np.log2(np.arange(len(top_k)) + 2)
    return np.sum(gains / discounts)
@dataclass
class BERTModelConfig(Config):
    """Huggingface-specific BERT model settings.

    Attributes:
        model_name (str): name of the BERT model, e.g. ``bert-base-uncased``.
        cache_dir (str): tokenizer cache directory; created if it doesn't
            exist. NOTE(review): concurrent processes sharing this directory
            may race on reads/writes -- confirm before relying on it.
        use_fast_tokenizer (bool): whether to use the fast tokenizer.
        mask_token (str): special token used for masking.
        task (str): the task to use for BERT, e.g. ``mlm``.
        past_index (int): index of the past state in the model's state dict.
        prediction_loss_only (bool): when False, also produce metrics for
            predictions and labels.
        process_line_by_line (bool): when True, process the input
            line-by-line.
    """
    model_name: str = None
    cache_dir: str = None
    use_fast_tokenizer: bool = False
    mask_token: str = ''
    task: str = 'mlm'
    past_index: int | None = -2
    prediction_loss_only: bool = False
    process_line_by_line: bool = False

    @staticmethod
    def from_dict(config) -> BERTModelConfig:
        """Build a BERTModelConfig from a plain dict."""
        return from_dict(BERTModelConfig, config)
""" seed: int | None = None label_smoothing_factor: float | None = None batch_size: int | None = None max_seq_length: int | None = None @staticmethod def from_dict(config) -> BERTTrainingConfig: return from_dict(BERTTrainingConfig, config) @dataclass class BERTSpecificConfig(Config): """BERT configuration Specifies the model and training configuration for huggingface modeling scenarios. Attributes: loader_type (str): loader type hint. eg 'text' model (BERTModelConfig): BERT model configuration. training (BERTTrainingConfig): BERT training configuration. """ loader_type: str = None model: BERTModelConfig = None training: BERTTrainingConfig = None @staticmethod def from_dict(config) -> BERTSpecificConfig: result = BERTSpecificConfig() for k in config: if k == 'model': result.model = BERTModelConfig.from_dict(config[k]) elif k == 'training': result.training = BERTTrainingConfig.from_dict(config[k]) else: setattr(result, k, config[k]) return result @dataclass class BERTConfig(ModelConfig): """ Expected MLM config wraps the BERTSpecificConfig as a sub-field of the ModelConfig. """ BERT: BERTSpecificConfig = None @staticmethod def from_dict(config) -> ModelConfig: result = BERTConfig() for k in config: if k=="BERT": result.BERT = BERTConfig.from_dict(config[k]) else: setattr(result, k, config[k]) return result ================================================ FILE: experiments/mlm_bert/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class DataLoader(BaseDataLoader):
    """
    PyTorch dataloader for loading text data from text_dataset.
    """
    def __init__(self, mode, data, num_workers=0, **kwargs):
        """Build a dataloader over one user's (or the whole test set's) utterances.

        Args:
            mode (str): 'train', 'val' or 'test'; selects sampler and data slice.
            data: path to a FLUTE json file, or the already-loaded structure.
            num_workers (int): passed through to torch's DataLoader.
            **kwargs: must contain 'args' (experiment config dict) and
                'user_idx' (which client's data to load; -1 defers loading).

        Raises:
            ValueError: no tokenizer name/path in the config.
            Exception: unknown `mode`.
        """
        args = kwargs['args']
        task = args['task']
        user_idx = kwargs['user_idx']
        mlm_probability = args['mlm_probability']
        self.batch_size = args['batch_size']
        self.mode = mode
        self.num_workers = num_workers
        self.utt_ids = None
        max_samples_per_user = args.get('max_samples_per_user', -1)
        min_words_per_utt = args.get('min_words_per_utt', 5)

        tokenizer_kwargs = {
            "cache_dir": args['cache_dir'],
            "use_fast": args['tokenizer_type_fast'],
            "use_auth_token": None
        }
        if 'tokenizer_name' in args:
            tokenizer = AutoTokenizer.from_pretrained(args['tokenizer_name'], **tokenizer_kwargs)
        elif 'model_name_or_path' in args:
            tokenizer = AutoTokenizer.from_pretrained(args['model_name_or_path'], **tokenizer_kwargs)
        else:
            raise ValueError("You are instantiating a new tokenizer from scratch. This is not supported by this script.")

        print_rank("Tokenizer is: {}".format(tokenizer), loglevel=logging.DEBUG)

        dataset = Dataset(
            data,
            args=args,
            # BUG FIX: was `self.mode is not 'train'` — an identity comparison
            # against a string literal (implementation-dependent, raises a
            # SyntaxWarning on modern Pythons). Equality is what is meant.
            test_only=self.mode != 'train',
            tokenizer=tokenizer,
            user_idx=user_idx,
            max_samples_per_user=max_samples_per_user,
            min_words_per_utt=min_words_per_utt,
        )
        self.utt_ids = dataset.user

        try:
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=task == 'mlm',
                mlm_probability=mlm_probability,
            )
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt. Keep the best-effort fallback but only for
        # ordinary exceptions.
        except Exception:
            print('There is an issue with the DataCollator .. Falling back to default_data_collator')
            data_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)

        if self.mode == 'train':
            # Random order over the user's utterances for SGD
            train_sampler = RandomSampler(dataset)
            super(DataLoader, self).__init__(
                dataset,
                batch_size=self.batch_size,
                sampler=train_sampler,
                collate_fn=data_collator,
                drop_last=False,
                num_workers=self.num_workers,
                pin_memory=True,
            )
        elif self.mode == 'val' or self.mode == 'test':
            # Deterministic order for evaluation
            eval_sampler = SequentialSampler(dataset)
            super(DataLoader, self).__init__(
                dataset,
                sampler=eval_sampler,
                batch_size=self.batch_size,
                collate_fn=data_collator,
                drop_last=False,
                num_workers=self.num_workers,
                pin_memory=True,
            )
        else:
            raise Exception("Sorry, there is something wrong with the 'mode'-parameter ")

    def get_user(self):
        """Return the id of the user whose data this loader serves ('test_only' for eval)."""
        return self.utt_ids
class Dataset(BaseDataset):
    """
    Map a text source to the target text.

    Loads one user's utterances (or, for val/test, every user's) from the
    FLUTE json structure, filters them by a minimum word count, and either
    keeps raw text (process_line_by_line=True; tokenized lazily in
    __getitem__) or pre-tokenizes and re-chunks everything into fixed-length
    frames of max_seq_length tokens (post_process_list).
    """

    def __init__(self, data, args, tokenizer=None, test_only=False, user_idx=0, max_samples_per_user=-1, min_words_per_utt=5, **kwargs):
        # Holds raw-utterance dicts, later replaced by tokenized frames when
        # process_line_by_line is False.
        self.utt_list = list()
        self.test_only = test_only
        self.padding = args.get('padding', True)
        self.max_seq_length = args['max_seq_length']
        self.max_samples_per_user = max_samples_per_user
        self.min_num_words = min_words_per_utt
        self.process_line_by_line = args.get('process_line_by_line', False)
        self.user = None

        if tokenizer != None:
            self.tokenizer = tokenizer
        else:
            # No tokenizer supplied by the caller: build one from the config.
            tokenizer_kwargs = {
                "cache_dir": args['cache_dir'],
                "use_fast": args['tokenizer_type_fast'],
                "use_auth_token": None
            }
            if 'tokenizer_name' in args:
                self.tokenizer = AutoTokenizer.from_pretrained(args['tokenizer_name'], **tokenizer_kwargs)
            elif 'model_name_or_path' in args:
                self.tokenizer = AutoTokenizer.from_pretrained(args['model_name_or_path'], **tokenizer_kwargs)
            else:
                raise ValueError("You are instantiating a new tokenizer from scratch. This is not supported by this script.")

        # Clamp max_seq_length against the tokenizer's own model_max_length.
        if self.max_seq_length is None:
            self.max_seq_length = self.tokenizer.model_max_length
            if self.max_seq_length > 512:
                print_rank(
                    f"The tokenizer picked seems to have a very large `model_max_length` ({self.tokenizer.model_max_length}). "
                    "Picking 512 instead. You can change that default value by passing --max_seq_length xxx.",
                    loglevel=logging.DEBUG
                )
                self.max_seq_length = 512
        else:
            if self.max_seq_length > self.tokenizer.model_max_length:
                print_rank(
                    f"The max_seq_length passed ({self.max_seq_length}) is larger than the maximum length for the"
                    f"model ({self.tokenizer.model_max_length}). Using max_seq_length={self.tokenizer.model_max_length}.",
                    loglevel=logging.DEBUG
                )
            self.max_seq_length = min(self.max_seq_length, self.tokenizer.model_max_length)

        self.load_data(data, user_idx)
        if user_idx != -1:  # Avoid loading unnecessary data on memory before training
            if not self.process_line_by_line:
                # Eagerly tokenize and re-chunk the whole user's data.
                self.post_process_list()

    def __len__(self):
        # Number of utterances (raw) or frames (after post_process_list).
        return len(self.utt_list)

    def __getitem__(self, idx):
        # Find the index in the available data
        if self.process_line_by_line:
            # Lazy tokenization of a single utterance.
            # NOTE(review): 'src_text' is a single string here, and
            # LineByLineTextDataset iterates `input_lines` — for a plain str
            # that iterates characters. Confirm this is the intended input
            # shape for the line-by-line path.
            tokenized_text = LineByLineTextDataset(
                tokenizer=self.tokenizer,
                input_lines=self.utt_list[idx]['src_text'],
                line_by_line=True,
                truncation=True,
                max_length=self.max_seq_length,
                padding="max_length")
            # Record the tokenized length as the utterance 'duration'.
            self.utt_list[idx]['duration'] = len(tokenized_text['input_ids'])
            return tokenized_text
        else:
            return self.utt_list[idx]

    def load_data(self, orig_strct, user_idx):
        """ Reads the data for a specific user (unless it's for val/testing)
        and returns a list of embeddings and targets."""
        if isinstance(orig_strct, str):
            # A path was passed instead of the parsed structure.
            print('Loading json-file: ', orig_strct)
            with open(orig_strct, 'r') as fid:
                orig_strct = json.load(fid)

        # Standard FLUTE layout: parallel lists of users/num_samples plus a
        # user -> data mapping.
        self.user_list = orig_strct['users']
        self.num_samples = orig_strct['num_samples']
        self.user_data = orig_strct['user_data']

        if user_idx != -1:  # Avoid loading unnecessary data on memory before training
            if self.test_only:
                # Val/test: flatten all users into one dataset.
                self.user = 'test_only'
                self.process_x(self.user_data)
            else:
                self.user = self.user_list[user_idx]
                self.process_x(self.user_data[self.user])

    def process_x(self, raw_x_batch):
        """Filter/truncate the raw utterances into self.utt_list and keep
        num_samples in sync with what was actually retained."""
        if self.test_only:
            for i, user in enumerate(self.user_list):
                counter = self.process_user(user, raw_x_batch[user])
                self.num_samples[i] = counter  # Update userdata counter "num_samples[user]" after truncation
        else:
            counter = self.process_user(self.user, raw_x_batch)
            self.num_samples[self.user_list.index(self.user)] = counter  # Update userdata counter "num_samples[user]" after truncation

        # Placeholder so the dataset is never empty (DataLoader needs len > 0).
        if len(self.utt_list) == 0:
            self.utt_list = [{'src_text': 'N/A', 'duration': 0, 'loss_weight': 1.0}]
        print_rank('Processing json-structure for User: {} Utterances Processed: {}'.format(self.user, len(self.utt_list)), loglevel=logging.DEBUG)

    def process_user(self, user, user_data):
        """Append one user's utterances to self.utt_list.

        Skips utterances shorter than min_num_words and stops once
        max_samples_per_user (if >= 0) is reached. Returns the number of
        utterances kept for this user.
        """
        counter = 0
        for line in user_data:
            for e in line:
                if len(e.split()) < self.min_num_words:
                    continue

                if self.max_samples_per_user > -1 and counter >= self.max_samples_per_user:
                    print_rank('Max allowed size per user is reached for user: {}, N: {} utts, Utt_list Len: {}' \
                            .format(user, counter, len(self.utt_list)), loglevel=logging.DEBUG)
                    return counter
                counter += 1

                utt = {}
                utt['src_text'] = e
                # 'duration' here is the word count; it is overwritten with the
                # token count when tokenized lazily in __getitem__.
                utt['duration'] = len(e.split())
                utt['loss_weight'] = 1.0
                self.utt_list.append(utt)
        return counter

    def post_process_list(self):
        """Tokenize all retained text and re-chunk it into frames of exactly
        max_seq_length tokens, replacing self.utt_list in place."""
        # Use only the text part of the dataset
        input_lines = [line['src_text'] for line in self.utt_list]

        # Process all lines of text
        print_rank('Tokenizing {} Utterances'.format(len(input_lines)), loglevel=logging.DEBUG)
        self.utt_list = LineByLineTextDataset(self.tokenizer, input_lines)  # this one has return_special_tokens_mask as True

        def group_texts(examples):
            """"Main data processing function that will concatenate all texts
            from our dataset and generate chunks of max_seq_length."""
            print_rank('Concatenating Frames in Sequences of {} samples'.format(self.max_seq_length), loglevel=logging.DEBUG)
            if self.padding:
                # Padding last frame so the total token count is an exact
                # multiple of max_seq_length.
                total_length = sum([len(k) for k in examples['input_ids']])
                print_rank('Found {} samples Before Concatenation'.format(total_length), loglevel=logging.DEBUG)
                padN = self.max_seq_length - (total_length % self.max_seq_length)
                print_rank('Padding last frame with {} samples'.format(padN), loglevel=logging.DEBUG)
                print_rank('keys {}'.format(examples.keys()), loglevel=logging.DEBUG)
                # Pad tokens get attention_mask 0 / special_tokens_mask 1 so
                # they are ignored by the model and by masking.
                examples['input_ids'].append([self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)]*padN)
                examples['attention_mask'].append([0]*padN)
                if 'special_tokens_mask' in examples.keys():
                    examples['special_tokens_mask'].append([1]*padN)
                if 'token_type_ids' in examples.keys():
                    examples['token_type_ids'].append([0]*padN)

            # Concatenate all input.
            concatenated_examples = {k: list(itertools.chain.from_iterable(examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            print_rank('Concatenated in {} Samples'.format(total_length), loglevel=logging.DEBUG)
            total_length = (total_length // self.max_seq_length) * self.max_seq_length
            print_rank('Concatenated in {} Frames'.format(total_length // self.max_seq_length), loglevel=logging.DEBUG)

            # Split by chunks of max_len; this rebinds self.utt_list to the
            # list of fixed-length frames.
            self.utt_list = []
            for i in range(0, total_length, self.max_seq_length):
                utt = {}
                for k, t in concatenated_examples.items():
                    utt[k] = t[i : i + self.max_seq_length]
                self.utt_list.append(utt)
                print_rank('Utterance Len is: {}'.format(len(utt['input_ids'])), loglevel=logging.DEBUG)

        # Process list of text
        group_texts(self.utt_list)
        total_length = len(self.utt_list)
        print_rank('Finished Reshaping in Sequences of {} Frames'.format(total_length), loglevel=logging.INFO)

        # Update userdata after truncation
        if not self.test_only:
            self.num_samples[self.user_list.index(self.user)] = total_length

        # Not used anywhere but necessary when the dataset is initiated
        if total_length == 0:
            self.utt_list = [{"input_ids": [0, 2], "special_tokens_mask": [1, 1], "attention_mask": [0, 0]}]


def LineByLineTextDataset(tokenizer, input_lines, truncation=True, max_length=512, padding = False, line_by_line=False):
    """Tokenize a batch of text lines with special-token masks.

    Empty / whitespace-only lines are dropped; the sentinel ['N/A'] (empty
    user) yields a minimal dummy encoding. When line_by_line is True, the
    first encoding is unwrapped so a single example is returned.
    """
    if input_lines == ['N/A']:
        # Dummy encoding for users with no usable data.
        batch_encoding = {"input_ids": [[0, 2]], "special_tokens_mask": [[1, 1]], "attention_mask": [[0, 0]]}
    else:
        lines = [line for line in input_lines if (len(line) > 0 and not line.isspace())]
        print_rank ('padding is : ' + str(padding), loglevel=logging.DEBUG)
        print_rank ('max_length is : ' + str(max_length), loglevel=logging.DEBUG)
        batch_encoding = tokenizer(lines, truncation=truncation, max_length=max_length, padding = padding, return_special_tokens_mask=True,)
    if line_by_line:
        # Unwrap the batch dimension: keep only the first encoded line.
        batch_encoding["input_ids"] = batch_encoding["input_ids"][0]
        batch_encoding["special_tokens_mask"] = batch_encoding["special_tokens_mask"][0]
        batch_encoding["attention_mask"] = batch_encoding["attention_mask"][0]
    return batch_encoding
class BERT(BaseModel):
    """FLUTE wrapper around a huggingface masked-LM (AutoModelForMaskedLM).

    Builds model + tokenizer from the 'BERT' section of the model config and
    exposes the forward / loss / inference hooks expected by core.model.BaseModel.
    """

    def __init__(self, model_config, **kwargs):
        """Instantiate tokenizer and masked-LM model.

        Args:
            model_config (dict): must contain a 'BERT' entry with 'model'
                (huggingface settings) and 'training' (seed, batch_size,
                label_smoothing_factor) sub-dicts.

        Raises:
            ValueError: when no config/tokenizer/model name or path is given
                (training from scratch is not supported here).
        """
        super(BERT, self).__init__()

        # Extract the BERT-specific section and its two sub-sections.
        args = model_config['BERT']
        model_args, training_args = args['model'], args['training']

        # Set seed before initializing model.
        set_seed(training_args['seed'])

        self.gradient_accumulation_steps = model_args.get('gradient_accumulation_steps', 1)
        self.past_index = model_args.get('past_index', -1)
        self.prediction_loss_only = model_args.get('prediction_loss_only', True)
        self.eval_accumulation_steps = model_args.get('eval_accumulation_steps', None)
        self.label_names = model_args.get('label_names', None)
        self.batch_size = training_args['batch_size']
        self.model_name = model_args['model_name']
        # BUG FIX: _prepare_inputs reads self._past whenever past_index >= 0;
        # without this initialization the very first call raised AttributeError.
        self._past = None

        if 'model_name_or_path' not in model_args:
            model_args['model_name_or_path'] = self.model_name

        # Label smoothing (used in compute_loss when the factor is non-zero).
        if training_args['label_smoothing_factor'] != 0:
            self.label_smoother = LabelSmoother(epsilon=training_args['label_smoothing_factor'])
        else:
            self.label_smoother = None

        self.label_names = (["labels"]) if self.label_names is None else self.label_names

        config_kwargs = {
            "cache_dir": model_args['cache_dir'],
            "revision": None,
            "use_auth_token": None,
        }
        if 'config_name' in model_args:
            config = AutoConfig.from_pretrained(model_args['config_name'], **config_kwargs)
        elif 'model_name_or_path' in model_args:
            config = AutoConfig.from_pretrained(model_args['model_name_or_path'], **config_kwargs)
        else:
            raise ValueError(
                "You are instantiating a new configuration from scratch. This is not supported by this script."
            )

        tokenizer_kwargs = {
            "cache_dir": model_args['cache_dir'],
            "use_fast": model_args['use_fast_tokenizer'],
            "use_auth_token": None,
        }
        if 'tokenizer_name' in model_args:
            tokenizer = AutoTokenizer.from_pretrained(model_args['tokenizer_name'], **tokenizer_kwargs)
        elif 'model_name_or_path' in model_args:
            print('Loading Tokenizer from Pretrained: {}'.format(model_args['model_name_or_path']))
            tokenizer = AutoTokenizer.from_pretrained(model_args['model_name_or_path'], **tokenizer_kwargs)
        else:
            raise ValueError(
                "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            )

        self.output_layer_size = len(tokenizer)

        if 'model_name_or_path' in model_args:
            print('Loading Model from Pretrained: {}'.format(model_args['model_name_or_path']))
            self.model = AutoModelForMaskedLM.from_pretrained(
                model_args['model_name_or_path'],
                from_tf=False,
                config=config,
                cache_dir=model_args['cache_dir'],
                use_auth_token=None,
            )
            if 'adapter' in model_args:
                if model_args['adapter']:
                    self.model.add_adapter("FLUTE")
                    # Activate the adapter: freezes the backbone, trains only
                    # the adapter weights.
                    self.model.train_adapter("FLUTE")
        else:
            raise ValueError(
                "You are instantiating a new model from scratch. This is not supported by this script."
            )

        # Make the embedding matrix match the (possibly extended) tokenizer vocab.
        self.model.resize_token_embeddings(self.output_layer_size)

        total_params = 0
        trainable_params = 0
        for p in self.model.parameters():
            total_params += p.numel()
            if p.requires_grad:
                trainable_params += p.numel()
        print_rank(f"Total parameters count: {total_params}", loglevel=logging.DEBUG)         # ~109M
        print_rank(f"Trainable parameters count: {trainable_params}", loglevel=logging.DEBUG) # ~1M
        print_rank(f"Original Bert parameters count: {total_params-trainable_params}", loglevel=logging.DEBUG)

    def copy_state_dict(self, state_dict):
        """Load a copy of `state_dict` into the wrapped model.

        BUG FIX: the original did `self.model.state_dict = state_dict.clone()`,
        which (a) shadows the `state_dict()` *method* with a data attribute and
        (b) fails outright since an OrderedDict has no `.clone()`.
        `load_state_dict` is the supported way to copy parameters in.
        """
        self.model.load_state_dict(state_dict)

    def get_model(self):
        """Return the underlying huggingface model."""
        return self.model

    def _prepare_inputs(self, inputs):
        """
        Prepare :obj:`inputs` before feeding them to the model, converting them
        to tensors if they are not already and handling potential state.
        """
        for k, v in inputs.items():
            if isinstance(v, T.Tensor):
                inputs[k] = to_device(v)
        if self.past_index >= 0 and self._past is not None:
            # Re-inject the cached past state for models that support 'mems'.
            inputs["mems"] = self._past
        return inputs

    def forward(self, inputs):
        """Run a forward pass on a (device-prepared) batch dict."""
        inputs = self._prepare_inputs(inputs)
        return self.model(**inputs)

    def loss(self, inputs):
        """Perform a training step on a batch of inputs.

        Args:
            inputs (dict): batch produced by the data collator; targets are
                expected under 'labels'.

        Returns:
            T.Tensor: the training loss, scaled by gradient_accumulation_steps.
        """
        inputs = self._prepare_inputs(inputs)
        loss = self.compute_loss(inputs)
        loss = loss / self.gradient_accumulation_steps
        return loss

    def compute_loss(self, inputs_orig, return_outputs=False):
        """Compute the (optionally label-smoothed) MLM loss for a batch.

        Args:
            inputs_orig (dict): batch dict; targets under 'labels'.
            return_outputs (bool): also return the raw model outputs.

        Returns:
            loss tensor, or (loss, outputs) when return_outputs is True.
        """
        # Work on a local copy so popped keys don't affect the caller's batch.
        inputs = copy.deepcopy(inputs_orig)

        if self.label_smoother is not None and "labels" in inputs:
            # NOTE(review): labels are moved to CPU here while the logits may
            # live on GPU; confirm LabelSmoother behaves on GPU runs.
            labels = inputs["labels"].detach().cpu()
        else:
            labels = None

        # The following fields need to be removed for Roberta.
        if 'roberta' in self.model_name:
            if 'attention_mask' in inputs:
                inputs.pop('attention_mask')
            if 'special_tokens_mask' in inputs:
                inputs.pop('special_tokens_mask')

        # Forward pass for the transformer.
        outputs = self.model(**inputs)

        if self.past_index >= 0:
            # Cache past state for the next step.
            self._past = outputs[self.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

    def inference(self, inputs, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval") -> Dict:
        """Run prediction on one batch and return loss/accuracy metrics.

        Args:
            inputs (dict): batch dict; targets under 'labels'.
            ignore_keys (list, optional): output keys to skip when gathering
                predictions.
            metric_key_prefix (str): prefix for metric names ('eval' default).

        Returns:
            dict with 'output' (eval loss), 'acc' and 'batch_size'.
        """
        # BUG FIX: the default used to be a mutable `[]`; use the None
        # sentinel (backward-compatible — omitted argument behaves the same).
        ignore_keys = [] if ignore_keys is None else ignore_keys
        output, batch_size = self.prediction_loop(
            inputs,
            description="Evaluation",
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix)
        return {'output': output['eval_loss'], 'acc': output['eval_acc'], 'batch_size': batch_size[0]}

    def prediction_loop(
        self, inputs, description: str, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval",
    ) -> Union[Dict, List[int]]:
        """
        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and
        :obj:`Trainer.predict()`. Works both with or without labels.
        """
        out_label_ids = None
        if 'labels' in inputs:
            out_label_ids = inputs['labels'].detach().cpu()
        # BUG FIX: attention_mask was left unbound when the batch had no
        # 'attention_mask' key but was still passed to ComputeMetrics below.
        attention_mask = None
        if 'attention_mask' in inputs:
            attention_mask = inputs['attention_mask'].detach().cpu()

        losses_host = None
        preds_host = None
        labels_host = None
        # Single-host, single-process evaluation in FLUTE.
        world_size = 1
        num_hosts = 1

        eval_losses_gatherer = DistributedTensorGatherer(world_size, num_hosts, make_multiple_of=self.batch_size)
        if not self.prediction_loss_only:
            preds_gatherer = DistributedTensorGatherer(world_size, num_hosts)
            labels_gatherer = DistributedTensorGatherer(world_size, num_hosts)

        self.model.eval()

        if self.past_index >= 0:
            self._past = None

        loss, logits, _ = self.prediction_step(inputs, ignore_keys=ignore_keys, has_labels=True)
        if loss is not None:
            # One loss value per sample so the gatherer can weight correctly.
            losses = loss.repeat(self.batch_size).cpu()
            losses_host = losses if losses_host is None else T.cat((losses_host, losses), dim=0)
        if logits is not None:
            preds_host = logits.detach().cpu() if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
        if out_label_ids is not None:
            labels_host = out_label_ids if labels_host is None else nested_concat(labels_host, out_label_ids, padding_index=-100)

        # Gather all tensors and put them back on the CPU if we have done
        # enough accumulation steps.
        if self.eval_accumulation_steps is not None:
            eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
            if not self.prediction_loss_only:
                preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
            # Set back to None to begin a new accumulation.
            losses_host, preds_host, labels_host = None, None, None

        if self.past_index and hasattr(self, "_past"):
            # Clean the state at the end of the evaluation loop.
            # BUG FIX: was `delattr(self, "_past")`, which made any later
            # access in _prepare_inputs raise AttributeError; reset instead.
            self._past = None

        # Gather all remaining tensors and put them back on the CPU.
        if num_hosts > 1:
            # BUG FIX: removed `want_masked=True` — DistributedTensorGatherer.
            # add_arrays does not accept that keyword and the call raised
            # TypeError.
            eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
            if not self.prediction_loss_only:
                preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
            eval_loss = eval_losses_gatherer.finalize()
            preds = preds_gatherer.finalize() if not self.prediction_loss_only else None
            label_ids = labels_gatherer.finalize() if not self.prediction_loss_only else None
        else:
            eval_loss = losses_host
            preds = preds_host
            label_ids = labels_host

        if preds is not None and label_ids is not None:
            metrics = ComputeMetrics.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids), attention_mask)
        else:
            metrics = {}

        if eval_loss is not None:
            metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

        # Prefix all keys with metric_key_prefix + '_'
        # NOTE(review): this assumes un-prefixed metric values are tensors
        # (have .item()); confirm against ComputeMetrics' return types.
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key).item()

        # NOTE(review): `preds.size()` requires preds to be a tensor, i.e.
        # prediction_loss_only must be False for inference() to work.
        return metrics, preds.size()

    def _gather_and_numpify(self, tensors, name):
        """
        Gather value of `tensors` (tensor or list/tuple of nested tensors) and
        convert them to numpy before concatenating them to `gathered`.
        """
        if tensors is None:
            return
        return nested_numpify(tensors)

    def prediction_step(
        self, inputs, ignore_keys: Optional[List[str]] = None, has_labels: bool = None
    ) -> Tuple[Optional[float], Optional[T.Tensor], Optional[T.Tensor]]:
        """Perform an evaluation step on :obj:`model` using obj:`inputs`.

        Args:
            inputs (dict): batch dict; targets under 'labels'.
            ignore_keys (list, optional): output keys to skip when gathering
                predictions.
            has_labels (bool): whether 'labels' is present in the batch.

        Return:
            Tuple[Optional[float], Optional[T.Tensor], Optional[T.Tensor]]:
            (loss, logits, labels), each possibly None.
        """
        # Robustness: ignore_keys is iterated below, so normalize None to [].
        ignore_keys = [] if ignore_keys is None else ignore_keys
        inputs = self._prepare_inputs(inputs)

        # labels may be popped when computing the loss (label smoothing for
        # instance) so we grab them first.
        if has_labels:
            labels = inputs["labels"].detach().cpu()
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None

        with T.no_grad():
            if has_labels:
                loss, outputs = self.compute_loss(inputs, return_outputs=True)
                loss = loss.mean().detach()
                if isinstance(outputs, dict):
                    logits = outputs["logits"]
                else:
                    logits = outputs[1:]
            else:
                loss = None
                outputs = self.model(**inputs)
                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                else:
                    logits = outputs
                if self.past_index >= 0:
                    self._past = outputs[self.past_index - 1]

        if self.prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        if len(logits) == 1:
            logits = logits[0]
        return (loss, logits, labels)

    def floating_point_ops(self, inputs):
        """
        For models that inherit from :class:`~transformers.PreTrainedModel`,
        uses that method to compute the number of floating point operations for
        every backward + forward pass. Returns 0 for models without it.
        """
        if hasattr(self.model, "floating_point_ops"):
            return self.model.floating_point_ops(inputs)
        else:
            return 0

    def set_eval(self):
        """ Bring the model into evaluation mode """
        self.model.eval()

    def set_train(self):
        """ Bring the model into train mode """
        self.model.train()
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Torch utilities for the Trainer class. """ import json import math import os import warnings from contextlib import contextmanager from dataclasses import dataclass from typing import Dict, Iterator, List, Optional, Union import numpy as np import torch from packaging import version from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler # this is used to supress an undesired warning emitted by pytorch versions 1.4.2-1.7.0 try: from torch.optim.lr_scheduler import SAVE_STATE_WARNING except ImportError: SAVE_STATE_WARNING = "" def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100): """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: return torch.cat((tensor1, tensor2), dim=0) # Let's figure out the new shape new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:] # Now let's fill the result tensor result = tensor1.new_full(new_shape, padding_index) result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1 result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2 return result def numpy_pad_and_concatenate(array1, array2, padding_index=-100): """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary.""" if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]: return np.concatenate((array1, array2), dim=0) # Let's figure out the 
new shape new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:] # Now let's fill the result tensor result = np.full_like(array1, padding_index, shape=new_shape) result[: array1.shape[0], : array1.shape[1]] = array1 result[array1.shape[0] :, : array2.shape[1]] = array2 return result def nested_concat(tensors, new_tensors, padding_index=-100): """ Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or nested list/tuples of tensors. """ assert type(tensors) == type( new_tensors ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}." if isinstance(tensors, (list, tuple)): return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors)) elif isinstance(tensors, torch.Tensor): return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) elif isinstance(tensors, np.ndarray): return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) else: raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}") def nested_numpify(tensors): "Numpify `tensors` (even if it's a nested list/tuple of tensors)." if isinstance(tensors, (list, tuple)): return type(tensors)(nested_numpify(t) for t in tensors) return tensors.cpu().numpy() def nested_detach(tensors): "Detach `tensors` (even if it's a nested list/tuple of tensors)." 
if isinstance(tensors, (list, tuple)): return type(tensors)(nested_detach(t) for t in tensors) return tensors.detach() def reissue_pt_warnings(caught_warnings): # Reissue warnings that are not the SAVE_STATE_WARNING if len(caught_warnings) > 1: for w in caught_warnings: if w.category != UserWarning or w.message != SAVE_STATE_WARNING: warnings.warn(w.message, w.category) def nested_new_like(arrays, num_samples, padding_index=-100): """ Create the same nested structure as `arrays` with a first dimension always at `num_samples`.""" if isinstance(arrays, (list, tuple)): return type(arrays)(nested_new_like(x, num_samples) for x in arrays) return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:])) def nested_expand_like(arrays, new_seq_length, padding_index=-100): """ Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding.""" if isinstance(arrays, (list, tuple)): return type(arrays)(nested_expand_like(x, new_seq_length, padding_index=padding_index) for x in arrays) result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:]) result[:, : arrays.shape[1]] = arrays return result def nested_truncate(tensors, limit): "Truncate `tensors` at `limit` (even if it's a nested list/tuple of tensors)." if isinstance(tensors, (list, tuple)): return type(tensors)(nested_truncate(t, limit) for t in tensors) return tensors[:limit] def _get_first_shape(arrays): """Return the shape of the first array found in the nested struct `arrays`.""" if isinstance(arrays, (list, tuple)): return _get_first_shape(arrays[0]) return arrays.shape class DistributedTensorGatherer: """ A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks. 
class DistributedTensorGatherer:
    """
    A class responsible for properly gathering tensors (or nested list/tuple of
    tensors) on the CPU by chunks.

    Each process is assumed to produce a contiguous slice of the (padded)
    dataset; `add_arrays` interleaves those slices back into their original
    order and `finalize` truncates the extra samples the sampler added to make
    every process's share the same length.

    Args:
        world_size (:obj:`int`): the number of processes used in the
            distributed training.
        num_samples (:obj:`int`): the number of samples in our dataset.
        make_multiple_of (:obj:`int`, `optional`): if passed, the class assumes
            the datasets passed to each process are made to be a multiple of
            this argument (by adding samples).
        padding_index (:obj:`int`, `optional`, defaults to -100): the padding
            index to use if the arrays don't all have the same sequence length.
    """

    def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
        self.world_size = world_size
        self.num_samples = num_samples
        # Round num_samples up so it divides evenly across processes (and the
        # optional make_multiple_of batch granularity).
        total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
        self.total_samples = int(np.ceil(num_samples / total_size)) * total_size
        self.process_length = self.total_samples // world_size
        self._storage = None
        self._offsets = None
        self.padding_index = padding_index

    def add_arrays(self, arrays):
        """
        Add :obj:`arrays` to the internal storage. Will initialize the storage
        to the full size at the first arrays passed so that if we're bound to
        get an OOM, it happens at the beginning.

        NOTE(review): model.py's prediction_loop used to call this with a
        `want_masked=True` keyword that this signature does not accept —
        reconcile with that call site.
        """
        if arrays is None:
            return
        if self._storage is None:
            self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
            # One write offset per process slice.
            self._offsets = list(range(0, self.total_samples, self.process_length))
        else:
            storage_shape = _get_first_shape(self._storage)
            arrays_shape = _get_first_shape(arrays)
            if len(storage_shape) > 1 and storage_shape[1] < arrays_shape[1]:
                # If we get new arrays that are too big too fit, we expand the
                # shape of the storage.
                self._storage = nested_expand_like(self._storage, arrays_shape[1], padding_index=self.padding_index)
        slice_len = self._nested_set_tensors(self._storage, arrays)
        for i in range(self.world_size):
            self._offsets[i] += slice_len

    def _nested_set_tensors(self, storage, arrays):
        """Scatter each process's slice of `arrays` into `storage` at the
        current offsets; returns the per-process slice length."""
        if isinstance(arrays, (list, tuple)):
            for x, y in zip(storage, arrays):
                slice_len = self._nested_set_tensors(x, y)
            return slice_len
        assert (
            arrays.shape[0] % self.world_size == 0
        ), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."

        slice_len = arrays.shape[0] // self.world_size
        for i in range(self.world_size):
            if len(arrays.shape) == 1:
                storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
            else:
                # Only the columns actually present are written; the rest stays
                # at padding_index.
                storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
                    i * slice_len : (i + 1) * slice_len
                ]
        return slice_len

    def finalize(self):
        """
        Return the properly gathered arrays and truncate to the number of
        samples (since the sampler added some extras to get each process a
        dataset of the same length). Returns None if nothing was added.
        """
        if self._storage is None:
            return
        if self._offsets[0] != self.process_length:
            # BUG FIX: this used `logger.warn(...)` but no `logger` is defined
            # anywhere in this module, so hitting this path raised NameError.
            warnings.warn("Not all data has been set. Are you sure you passed all values?")
        return nested_truncate(self._storage, self.num_samples)
not-padded): num_active_elements = padding_mask.numel() - padding_mask.long().sum() nll_loss = nll_loss.sum() / num_active_elements smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1]) return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None): """ Return a list of indices so that each slice of :obj:`batch_size` consecutive indices correspond to elements of similar lengths. To do this, the indices are: - randomly permuted - grouped in mega-batches of size :obj:`mega_batch_mult * batch_size` - sorted by length in each mega-batch The result is the concatenation of all mega-batches, with the batch of :obj:`batch_size` containing the element of maximum length placed first, so that an OOM happens sooner rather than later. """ # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller. if mega_batch_mult is None: mega_batch_mult = min(len(lengths) // (batch_size * 4), 50) # Just in case, for tiny datasets if mega_batch_mult == 0: mega_batch_mult = 1 # We need to use torch for the random part as a distributed sampler will set the random seed for torch. indices = torch.randperm(len(lengths), generator=generator) megabatch_size = mega_batch_mult * batch_size megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches] # The rest is to get the biggest batch first. 
# Since each megabatch is sorted by descending length, the longest element is the first megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches] max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item() # Switch to put the longest element in first position megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0] return sum(megabatches, []) class LengthGroupedSampler(Sampler): r""" Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ def __init__(self, dataset: Dataset, batch_size: int, lengths: Optional[List[int]] = None): self.dataset = dataset self.batch_size = batch_size if lengths is None: if not isinstance(dataset[0], dict) or "input_ids" not in dataset[0]: raise ValueError( "Can only automatically infer lengths for datasets whose items are dictionaries with an " "'input_ids' key." ) lengths = [len(feature["input_ids"]) for feature in dataset] self.lengths = lengths def __len__(self): return len(self.lengths) def __iter__(self): indices = get_length_grouped_indices(self.lengths, self.batch_size) return iter(indices) class DistributedLengthGroupedSampler(DistributedSampler): r""" Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ # Copied and adapted from PyTorch DistributedSampler. 
def __init__( self, dataset: Dataset, batch_size: int, num_replicas: Optional[int] = None, rank: Optional[int] = None, seed: int = 0, drop_last: bool = False, lengths: Optional[List[int]] = None, ): if num_replicas is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") num_replicas = dist.get_world_size() if rank is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") rank = dist.get_rank() self.dataset = dataset self.batch_size = batch_size self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.drop_last = drop_last # If the dataset length is evenly divisible by # of replicas, then there # is no need to drop any data, since the dataset will be split equally. if self.drop_last and len(self.dataset) % self.num_replicas != 0: # Split to nearest available length that is evenly divisible. # This is to ensure each rank receives the same amount of data when # using this Sampler. self.num_samples = math.ceil((len(self.dataset) - self.num_replicas) / self.num_replicas) else: self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) self.total_size = self.num_samples * self.num_replicas self.seed = seed if lengths is None: if not isinstance(dataset[0], dict) or "input_ids" not in dataset[0]: raise ValueError( "Can only automatically infer lengths for datasets whose items are dictionaries with an " "'input_ids' key." ) lengths = [len(feature["input_ids"]) for feature in dataset] self.lengths = lengths def __iter__(self) -> Iterator: # Deterministically shuffle based on epoch and seed g = torch.Generator() g.manual_seed(self.seed + self.epoch) indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g) if not self.drop_last: # add extra samples to make it evenly divisible indices += indices[: (self.total_size - len(indices))] else: # remove tail of data to make it evenly divisible. 
indices = indices[: self.total_size] assert len(indices) == self.total_size # subsample indices = indices[self.rank : self.total_size : self.num_replicas] assert len(indices) == self.num_samples return iter(indices) # In order to keep `trainer.py` compact and easy to understand, place any secondary PT Trainer # helper methods here def _get_learning_rate(self): if self.deepspeed: # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during # that time `get_last_lr` will fail if called during that warm up stage, so work around it: try: last_lr = self.lr_scheduler.get_last_lr()[0] except AssertionError as e: if "need to call step" in str(e): logger.warn("tried to get lr value before scheduler/optimizer started stepping, returning lr=0") last_lr = 0 else: raise else: last_lr = ( # backward compatibility for pytorch schedulers self.lr_scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else self.lr_scheduler.get_lr()[0] ) return last_lr def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]: """ Reformat Trainer metrics values to a human-readable format Args: metrics (:obj:`Dict[str, float]`): The metrics returned from train/evaluate/predict Returns: metrics (:obj:`Dict[str, float]`): The reformatted metrics """ metrics_copy = metrics.copy() for k, v in metrics_copy.items(): if "_mem_" in k: metrics_copy[k] = f"{ v >> 20 }MB" elif k == "total_flos": metrics_copy[k] = f"{ int(v) >> 30 }GF" elif type(metrics_copy[k]) == float: metrics_copy[k] = round(v, 4) return metrics_copy def log_metrics(self, split, metrics): """ Log metrics in a specially formatted way Args: split (:obj:`str`): Mode/split name: one of ``train``, ``eval``, ``test`` metrics (:obj:`Dict[str, float]`): The metrics returned from train/evaluate/predictmetrics: metrics dict """ logger.info(f"***** {split} metrics *****") metrics_formatted = 
self.metrics_format(metrics) k_width = max(len(str(x)) for x in metrics_formatted.keys()) v_width = max(len(str(x)) for x in metrics_formatted.values()) for key in sorted(metrics_formatted.keys()): logger.info(f" {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}") def save_metrics(self, split, metrics): """ Save metrics into a json file for that split, e.g. ``train_results.json``. Args: split (:obj:`str`): Mode/split name: one of ``train``, ``eval``, ``test``, ``all`` metrics (:obj:`Dict[str, float]`): The metrics returned from train/evaluate/predict """ path = os.path.join(self.args.output_dir, f"{split}_results.json") with open(path, "w") as f: json.dump(metrics, f, indent=4, sort_keys=True) ================================================ FILE: experiments/mlm_bert/utils/trainer_utils.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. # coding=utf-8 # Copyright 2020-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities for the Trainer and TFTrainer class. Should be independent from PyTorch and TensorFlow. """ import random from typing import Any, Dict, NamedTuple, Optional, Tuple, Union import numpy as np import torch import logging from utils import print_rank def set_seed(seed: int): """ Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if installed). Args: seed (:obj:`int`): The seed to set. 
""" random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # ^^ safe to call this function even if cuda is not available class EvalPrediction(NamedTuple): """ Evaluation output (always contains labels), to be used to compute metrics. Parameters: predictions (:obj:`np.ndarray`): Predictions of the model. label_ids (:obj:`np.ndarray`): Targets to be matched. """ predictions: Union[np.ndarray, Tuple[np.ndarray]] label_ids: np.ndarray class PredictionOutput(NamedTuple): predictions: Union[np.ndarray, Tuple[np.ndarray]] label_ids: Optional[np.ndarray] metrics: Optional[Dict[str, float]] class ComputeMetrics: def __init__(self, p: EvalPrediction, mask=None): self.EvalPrediction = EvalPrediction self.compute_metrics( self.EvalPrediction) @staticmethod def compute_metrics(p: EvalPrediction, mask=None): print_rank('Prediction Block Size: {}'.format(p.predictions.size()), loglevel=logging.DEBUG) if len(list(p.predictions.size()))<3: if len(list(p.predictions.size()))<2: print_rank('There is something REALLY wrong with prediction tensor:'.format(p.predictions.size()), loglevel=logging.INFO) return {'acc': torch.tensor(0.0)} print_rank('There is something wrong with prediction tensor:'.format(p.predictions.size()), loglevel=logging.INFO) preds = np.argmax(p.predictions, axis=1) else: preds = np.argmax(p.predictions, axis=2) if mask is None: return {'acc': (preds == p.label_ids).float().mean()} else: #valid = preds >1 # reject oov predictions even if they're correct. valid = mask==1 return {'acc': (preds.eq(p.label_ids.cpu()) * valid.cpu()).float().mean()} ================================================ FILE: experiments/nlg_gru/README.md ================================================ # Simple example of a NLG task on Reddit Dataset Instructions on how to run the experiment, given below. 
## Preparing the data For this experiment, we can create a dummy dataset by running the script located in `testing/create_data.py` as follows: ```code python create_data.py --task nlg_gru ``` A couple of scripts are provided in `utils/preprocessing` for preprocessing .tsv files in case you want to use your own data. ## Creating a config file All the parameters of the experiment are passed in a YAML file. An basic example is provided in `configs/hello_world_nlg_gru_json.yaml` with the suggested parameters for local runs. The example provided above is for running json files. If you want to try with HDF5 files make sure to use the script `utils/preprocessing/from_json_to_hdf5.py` to convert the mock data to HDF5 format. ## Running the experiment Finally, to launch the experiment locally , it suffices to launch the `e2e_trainer.py` script using torch.distributed , you can use as example the following line: ```code python -m torch.distributed.run --nproc_per_node=3 e2e_trainer.py -dataPath .\testing\mockup\ -outputPath scratch -config .\testing\configs\hello_world_nlg_gru.yaml -task nlg_gru -backend nccl ``` For submitting jobs in Azure ML, we have included the instructions in the `Experiments` section of the main `README.md`. ================================================ FILE: experiments/nlg_gru/config.py ================================================ from __future__ import annotations from dataclasses import dataclass import sys sys.path.append('../../') from core.config import ModelConfig, from_dict @dataclass class GRUConfig(ModelConfig): """nlg_gru configuration The model configuration specifies model architecture, parameters, and initialization settings. Attributes: embed_dim (int): specific to GRU models, embedding dimension. vocab_size (int): specific to GRU models, the vocabulary size. hidden_dim (int): specific to GRU models, the hidden size. weight_init (str): ``default``, or ``xavier_normal``, indicating how to randomly initialize the model weights. 
OOV_correct (bool): whether OOV predictions are evaluated as correct, or ignored. """ embed_dim: int | None = None vocab_size: int | None = None hidden_dim: int | None = None weight_init: str = None OOV_correct: bool = False @staticmethod def from_dict(config) -> GRUConfig: return from_dict(GRUConfig, config) ================================================ FILE: experiments/nlg_gru/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import random import torch import numpy as np from core.dataloader import BaseDataLoader from torch.utils.data.distributed import DistributedSampler from experiments.nlg_gru.dataloaders.dataset import Dataset from utils.data_utils import BatchSampler, DynamicBatchSampler class DataLoader(BaseDataLoader): """ PyTorch dataloader for loading text data from text_dataset. """ def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] batch_sampler = None dataset = Dataset( data = kwargs['data'], test_only = not mode=="train", vocab_dict = args['vocab_dict'], user_idx = kwargs['user_idx'], max_num_words= args['max_num_words'], preencoded = args.get('preencoded', False)) if mode == 'train': sampler = DistributedSampler(dataset,num_replicas=1,rank=0) sampler.set_epoch(random.randint(0, 10**10)) batch_sampler = DynamicBatchSampler(sampler, frames_threshold = args['max_num_words'], max_batch_size = self.batch_size, unsorted_batch = args['unsorted_batch'], fps=1) elif mode == 'val' or mode == 'test': sampler = BatchSampler(dataset, batch_size=self.batch_size, randomize=False, drop_last=False) super().__init__(dataset, batch_sampler=sampler, num_workers=num_workers, collate_fn=self.collate_fn, pin_memory=args["pin_memory"]) return if batch_sampler is None: super().__init__(dataset, batch_size=self.batch_size, sampler=sampler, num_workers=num_workers, collate_fn=self.collate_fn, drop_last=True) else: 
super().__init__(dataset, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=self.collate_fn, pin_memory=args["pin_memory"]) def collate_fn(self, batch): def pad_and_concat_feats(labels): batch_size = len(labels) max_len = max(len(l[0]) for l in labels) cat_labels = np.full((batch_size, max_len), -1) for e, l in enumerate(labels): cat_labels[e,:len(l[0])] = np.squeeze(l) return cat_labels src_seq, utt_ids = zip(*batch) x_len = [len(s[0]) for s in src_seq] src_seq = pad_and_concat_feats(src_seq) packed = { 'x': torch.from_numpy(src_seq).long(), 'x_len': x_len, 'utt_ids' : utt_ids, 'total_frames' : sum(x_len), 'total_frames_with_padding' : np.prod(src_seq.shape), 'loss_weight' : None } return packed ================================================ FILE: experiments/nlg_gru/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import numpy as np import logging import json from utils import print_rank from core.dataset import BaseDataset from experiments.nlg_gru.utils.utility import * class Dataset(BaseDataset): """ Map a text source to the target text """ def __init__(self, data, min_num_words=2, max_num_words=25, test_only=False, user_idx=0, vocab_dict=None, preencoded=False, **kwargs): self.utt_list = list() self.test_only = test_only self.max_num_words = max_num_words self.min_num_words = min_num_words self.preencoded = preencoded # Load the vocab self.vocab = load_vocab(kwargs['args']['vocab_dict']) if 'args' in kwargs else load_vocab(vocab_dict) self.vocab_size = len(self.vocab) # reading the jsonl for a specific user_idx self.load_data(data, user_idx) def __len__(self): """Return the length of the elements in the list.""" return len(self.utt_list) def __getitem__(self, idx): """Find the index in the available data""" if self.preencoded: batch = np.array([self.utt_list[idx]['src_text']], dtype=np.int32) else: # case_backoff_batch tries to find the best 
capitalisation that will allow the word to be in vocabulary batch = case_backoff_batch([self.utt_list[idx]['src_text']], self.vocab.term_to_idx) batch = to_indices(self.vocab, batch) return batch, self.user def load_data(self, orig_strct, user_idx): if isinstance(orig_strct, str): print('Loading json-file: ', orig_strct) with open(orig_strct, 'r') as fid: orig_strct = json.load(fid) self.user_list = orig_strct['users'] self.num_samples = orig_strct['num_samples'] self.user_data = orig_strct['user_data'] self.user = 'test_only' if self.test_only else self.user_list[user_idx] if user_idx != -1: self.process_x(self.user_data) def process_x(self, user_data): print_rank('Processing data-structure: {} Utterances expected'.format(sum(self.num_samples)), loglevel=logging.DEBUG) for user in self.user_list: for e in user_data[user]['x']: utt={} utt['src_text'] = e if type(e) is list else e.split() utt['duration'] = len(e) if utt['duration']<= self.min_num_words: continue if utt['duration'] > self.max_num_words: utt['src_text'] = utt['src_text'][:self.max_num_words] utt['duration'] = self.max_num_words utt["loss_weight"] = 1.0 self.utt_list.append(utt) ================================================ FILE: experiments/nlg_gru/model.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import torch as T from torch import Tensor from typing import List, Tuple from core.model import BaseModel from utils import softmax, to_device class GRU2(T.nn.Module): def __init__(self, input_size, hidden_size, input_bias, hidden_bias): super(GRU2, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.w_ih = T.nn.Linear(input_size, 3 * hidden_size, input_bias) self.w_hh = T.nn.Linear(hidden_size, 3 * hidden_size, hidden_bias) def _forward_cell(self, input : Tensor, hidden : Tensor) -> Tensor: g_i = self.w_ih(input) g_h = self.w_hh(hidden) i_r, i_i, i_n = g_i.chunk(3, 1) h_r, h_i, h_n = g_h.chunk(3, 1) reset_gate = T.sigmoid(i_r + h_r) input_gate = T.sigmoid(i_i + h_i) new_gate = T.tanh(i_n + reset_gate * h_n) hy = new_gate + input_gate * (hidden - new_gate) return hy def forward(self, input : Tensor) -> Tuple[Tensor, Tensor]: hiddens : List[Tensor] = [to_device(T.zeros((input.shape[0], self.hidden_size)))] for step in range(input.shape[1]): hidden = self._forward_cell(input[:, step], hiddens[-1]) hiddens.append(hidden) return T.stack(hiddens, dim=1), hiddens[-1] class Embedding(T.nn.Module): def __init__(self, vocab_size, embedding_size): super(Embedding, self).__init__() self.vocab_size = vocab_size self.embedding_size = embedding_size self.table = T.nn.Parameter(T.zeros((vocab_size, embedding_size))) self.unembedding_bias = T.nn.Parameter(T.zeros(vocab_size)) delta = (3 / self.table.shape[1]) ** 0.5 T.nn.init.uniform_(self.table, -delta, delta) def forward(self, input : Tensor, embed : bool) -> Tensor: if embed: output = T.nn.functional.embedding(input, self.table) else: output = input @ self.table.t() + self.unembedding_bias return output class GRU(BaseModel): #DLM_2_0 def __init__(self, model_config, OOV_correct=False, dropout=0.0, topK_results=1, wantLogits=False, **kwargs): super(GRU, self).__init__() self.vocab_size = model_config['vocab_size'] self.embedding_size = model_config['embed_dim'] self.hidden_size = 
model_config['hidden_dim'] self.embedding = Embedding(self.vocab_size, self.embedding_size) self.rnn = GRU2(self.embedding_size, self.hidden_size, True, True) self.squeeze = T.nn.Linear(self.hidden_size, self.embedding_size, bias=False) self.OOV_correct = OOV_correct self.topK_results = topK_results self.dropout=dropout self.wantLogits=wantLogits if self.dropout>0.0: self.drop_layer = T.nn.Dropout(p=self.dropout) def forward(self, input : T.Tensor) -> Tuple[Tensor, Tensor]: input = input['x'] if isinstance(input, dict) else input input = to_device(input) embedding = self.embedding(input, True) hiddens, state = self.rnn(embedding) if self.dropout>0.0: hiddens= self.drop_layer(hiddens) output = self.embedding(self.squeeze(hiddens), False) return output, state def loss(self, input : T.Tensor) -> T.Tensor: input = input['x'] if isinstance(input, dict) else input input = to_device(input) non_pad_mask = input >= 0 input = input * non_pad_mask.long() non_pad_mask = non_pad_mask.view(-1) # Run the forward pass output, _ = self.forward(input[:, :-1]) # Estimate the targets targets = input.view(-1)[non_pad_mask] preds = output.view(-1, self.vocab_size)[non_pad_mask] # Estimate the loss return T.nn.functional.cross_entropy(preds, targets) def inference(self, input): input = input['x'] if isinstance(input, dict) else input input = to_device(input) non_pad_mask = input >= 0 input = input * non_pad_mask.long() non_pad_mask = non_pad_mask.view(-1) output, _ = self.forward(input[:, :-1]) # Apply mask to input/output targets = input.view(-1)[non_pad_mask] preds = output.view(-1, self.vocab_size)[non_pad_mask] # accuracy probs_topK, preds_topK = T.topk(preds, self.topK_results, sorted=True, dim=1) probs, preds = probs_topK[:,0], preds_topK[:,0] if self.OOV_correct: acc = preds.eq(targets).float().mean() else: valid = preds != 0 # reject oov predictions even if they're correct. 
acc = (preds.eq(targets) * valid).float().mean() if self.wantLogits: if 1: output= {'probabilities': softmax(probs_topK.cpu().detach().numpy(), axis=1), 'predictions': preds_topK.cpu().detach().numpy(), 'labels': targets.cpu().detach().numpy()} else: output = {'probabilities': probs_topK.cpu().detach().numpy(), 'predictions': preds_topK.cpu().detach().numpy(), 'labels': targets.cpu().detach().numpy()} return {'output':output, 'acc': acc.item(), 'batch_size': input.shape[0]} ================================================ FILE: experiments/nlg_gru/utils/utility.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import json import time from argparse import ArgumentParser import numpy as np from collections import namedtuple from tqdm import tqdm TR_UPPER = {ord('i'): 'İ'} TR_LOWER = {ord('I'): 'ı'} Vocab = namedtuple('Vocab', ['idx_to_term', 'term_to_idx']) def load_vocab(url): """Load a vocabulary file. url -- string -- url to the txt file returns -- Vocab(idx_to_term=list, term_to_idx=dict) """ term_to_idx = {} idx_to_term = [] with open(url, 'r', encoding='utf-8') as f: for i, line in enumerate(f): word = line.strip() idx_to_term.append(word) term_to_idx[word] = i return Vocab(idx_to_term, term_to_idx) def to_indices(vocab, batch, ndim=2, oov_idx=0, pad_idx=-1): """Convert a nested list of strings to a np.array of integers. vocab -- Vocab -- the vocabulary of the model batch -- [..[str]..] 
-- multidimensional batch ndim -- int -- number of dimensions in batch oov_idx -- int or None -- if specified, replace missing terms by the given index, otherwise raise an error pad_idx -- int or None -- if specified, pad short last-dimension as specified, otherwise raise an error raises -- ValueError -- if pad is required but pad_idx not specified -- KeyError -- if oov is required but oov_idx not specified returns -- np.array(int) -- term indices """ #print_rank(f'to_indices: batch len: {len(batch)} ndim: {ndim}') if ndim == 1: return np.array( [(vocab.term_to_idx[term] if oov_idx is None else vocab.term_to_idx.get(term, oov_idx)) for term in batch], dtype=np.int32) if ndim == 2: # note: in most circumstances there is only one example in the batch # as a result, padding is never applied. We rely on collate_fn to properly # apply padding. length = max(len(row) for row in batch) if pad_idx is None and min(len(row) for row in batch) != length: raise ValueError('Padding required, but no pad_idx provided') pad = length * [pad_idx] result = np.array( [[(vocab.term_to_idx[term] if oov_idx is None else vocab.term_to_idx.get(term, oov_idx)) for term in row] + pad[len(row):] for row in batch], dtype=np.int32) #print_rank(f'to_indices result: {result.shape}') return result # Flatten to a 2D batch, then recurse & reshape up (this ensures # padding is handled correctly) shape = [len(batch)] for _ in range(2, ndim): shape.append(len(batch[0])) batch = [item for sub_batch in batch for item in sub_batch] shape.append(-1) return to_indices(vocab, batch, ndim=2, oov_idx=oov_idx, pad_idx=pad_idx).reshape(*shape) def case_backoff_batch(batch, vocab): """Perform capitalization backoff on words both to lower & initial-upper case variants. 
batch -- list(list(string)) -- batch of sentences of words, to back off vocab -- set(string) -- vocabulary to consider returns -- list(list(string)) -- backed-off batch """ def _variants(word): yield word yield word.translate(TR_LOWER).lower() yield word.lower() if len(word) > 1: yield word[0].translate(TR_UPPER).capitalize() + word[1:] yield word.capitalize() return [[next((variant for variant in _variants(word) if variant in vocab), word) # will become OOV for word in sentence] for sentence in batch] def encode_data(data_dict, vocab): '''Encode data that is in the format expected by FLUTE Parameters ---------- data_dict: dict Dictionary where keys consist of usernames and values give the data for that user, specified by another dictionary with keys :code:`x` (features) and, optionally, :code:`y` (labels). vocab: Returns ------- dict Dictionary in the same format as the input one, but now the data in the :code:`x` field is given by tokens (i.e., integers), instead of strings. ''' new_dict = {} for key, value in tqdm(data_dict.items()): user_data = [s.split() for s in value['x']] processed_data = case_backoff_batch(user_data, vocab.term_to_idx) encoded_data = [[vocab.term_to_idx.get(term, 0) for term in row] for row in processed_data] new_dict[key] = {'x': encoded_data} return new_dict if __name__ == '__main__': parser = ArgumentParser(description='Encodes data') parser.add_argument('data_path', type=str, help='Path to data') parser.add_argument('vocab_path', type=str, help='Path to vocabulary') args = parser.parse_args() if not os.path.isfile(args.data_path): raise ValueError('data file does not exist') if not os.path.isfile(args.vocab_path): raise ValueError('vocabulary file does not exist') if args.data_path[-5:] != '.json': raise ValueError('argument must be a valid json file') # Load vocabulary print('Loading vocabulary...') vocab = load_vocab(args.vocab_path) # Load and encode data print('Loading data... 
', end='', flush=True) start_time = time.time() with open(args.data_path, 'r') as input_file: all_data = json.load(input_file) print(f'Finished in {time.time() - start_time:.2f}s') print('Converting data...') converted_user_data = encode_data(all_data['user_data'], vocab) # For debug purposes for k, v in converted_user_data.items(): print(f'USER: {k}\nDATA: {v}') break # Save encoded data to disk print('Saving encoded data to disk...') all_data['user_data'] = converted_user_data with open(f'{args.data_path[:-5]}-encoded.json', 'w') as output_file: json.dump(all_data, output_file) ================================================ FILE: experiments/nlp_rnn_fedshakespeare/README.md ================================================ ## FedML Benchmark ### Examples The example in this folder was taken from [FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version). ### Data FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository. ### Run If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/nlp_rnn_fedshakespeare/config.yaml -task nlp_rnn_fedshakespeare -backend nccl ``` ### Results This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). 
``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. ```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "fed_shakespeare" data_cache_dir: ~/fedml_data partition_method: "hetero" partition_alpha: 0.5 model_args: model: "rnn" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 715 client_num_per_round: 10 comm_round: 1200 epochs: 1 batch_size: 4 client_optimizer: sgd learning_rate: 0.8 weight_decay: 0.001 validation_args: frequency_of_the_test: 50 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedshakespeare_rnn/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ================================================ FILE: experiments/nlp_rnn_fedshakespeare/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. 
# Parameters needed to initialize the model model_config: model_type: RNN # class w/ `loss` and `inference` methods model_folder: experiments/nlp_rnn_fedshakespeare/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50000 # how many iterations between metric eval on val set rec_freq: 50 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 1200 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 4 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 4 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.8 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 4 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.8 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/nlp_rnn_fedshakespeare/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.nlp_rnn_fedshakespeare.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/nlp_rnn_fedshakespeare/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.nlp_rnn_fedshakespeare.dataloaders.preprocessing import FEDSHAKESPEARE class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if user_idx == -1: self.user = self.user_list self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.vstack([user_label for user_label in self.user_data_label.values()]) else: if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.vstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.int32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = FEDSHAKESPEARE() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/nlp_rnn_fedshakespeare/dataloaders/preprocessing.py ================================================ import logging import os import wget import tarfile import h5py import collections import numpy as np data_cache_dir = "./data" DEFAULT_TRAIN_FILE = "shakespeare_train.h5" DEFAULT_TEST_FILE = 
"shakespeare_test.h5" word_dict = None word_list = None _pad = "" _bos = "" _eos = "" ''' The FedeShakespeare dataset is taken from FedML repository. For more information regarding this dataset, please refer to https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/fed_shakespeare. In order to download the data run the following commands: - wget --no-check-certificate --no-proxy https://fedml.s3-us-west-1.amazonaws.com/shakespeare.tar.bz2 - tar -xvf shakespeare.tar.bz2 This code follows the steps of preprocessing in tff shakespeare dataset: https://github.com/google-research/federated/blob/master/utils/datasets/shakespeare_dataset.py ''' SEQUENCE_LENGTH = 80 # from McMahan et al AISTATS 2017 # Vocabulary re-used from the Federated Learning for Text Generation tutorial. # https://www.tensorflow.org/federated/tutorials/federated_learning_for_text_generation CHAR_VOCAB = list("dhlptx@DHLPTX $(,048cgkoswCGKOSW[_#'/37;?bfjnrvzBFJNRVZ\"&*.26:\naeimquyAEIMQUY]!%)-159\r") def preprocess(sentences, max_seq_len=SEQUENCE_LENGTH): sequences = [] def to_ids(sentence, num_oov_buckets=1): """ map list of sentence to list of [idx..] and pad to max_seq_len + 1 Args: num_oov_buckets : The number of out of vocabulary buckets. max_seq_len: Integer determining shape of padded batches. 
""" tokens = [char_to_id(c) for c in sentence] tokens = [char_to_id(_bos)] + tokens + [char_to_id(_eos)] if len(tokens) % (max_seq_len + 1) != 0: pad_length = (-len(tokens)) % (max_seq_len + 1) tokens += [char_to_id(_pad)] * pad_length return ( tokens[i : i + max_seq_len + 1] for i in range(0, len(tokens), max_seq_len + 1) ) for sen in sentences: sequences.extend(to_ids(sen)) return sequences def char_to_id(char): word_dict = get_word_dict() if char in word_dict: return word_dict[char] else: return len(word_dict) def get_word_dict(): global word_dict if word_dict == None: words = [_pad] + CHAR_VOCAB + [_bos] + [_eos] word_dict = collections.OrderedDict() for i, w in enumerate(words): word_dict[w] = i return word_dict def split(dataset): ds = np.asarray(dataset) x = ds[:, :-1] y = ds[:, 1:] return x, y def download_files(data_cache_dir): URL = "https://fedml.s3-us-west-1.amazonaws.com/shakespeare.tar.bz2" if not os.path.exists(data_cache_dir): os.makedirs(data_cache_dir) file_path = os.path.join(data_cache_dir,"shakespeare.tar.bz2") # Download and decompress the file (if we haven't already) if not os.path.exists(file_path): wget.download(URL, out=file_path) file = tarfile.open(file_path) file.extractall(os.path.join(data_cache_dir,'fed_shakespeare')) file.close() class FEDSHAKESPEARE: def __init__(self) : download_files(data_cache_dir) train_h5 = h5py.File(os.path.join(data_cache_dir,'fed_shakespeare', DEFAULT_TRAIN_FILE), "r") test_h5 = h5py.File(os.path.join(data_cache_dir, 'fed_shakespeare',DEFAULT_TEST_FILE), "r") test_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} train_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} for user in train_h5['examples'].keys(): train_dict['users'].append(user) raw_train = train_h5['examples'][user]['snippets'][()] raw_train = [x.decode("utf8") for x in raw_train] user_data = preprocess(raw_train) train_dict['num_samples'].append(len(user_data)) # split 
data train_x, train_y = split(user_data) train_dict['user_data'][user] = train_x train_dict['user_data_label'][user] = train_y for user in test_h5['examples'].keys(): test_dict['users'].append(user) raw_test = test_h5['examples'][user]['snippets'][()] raw_test = [x.decode("utf8") for x in raw_test] user_data = preprocess(raw_test) test_dict['num_samples'].append(len(user_data)) # split data test_x, test_y = split(user_data) test_dict['user_data'][user] = test_x test_dict['user_data_label'][user] = test_y print(" Dictionaries ready .. ") self.trainset, self.testset = train_dict, test_dict ================================================ FILE: experiments/nlp_rnn_fedshakespeare/model.py ================================================ import torch from torch import nn from torch.nn import functional as F from core.model import BaseModel ''' The CNN_DropOut model is taken from FedML repository. For more information regarding this model, please refer to https://github.com/FedML-AI/FedML/blob/master/python/fedml/model/nlp/rnn.py. ''' class nlp_rnn_fedshakespeare(nn.Module): def __init__(self, embedding_dim=8, vocab_size=90, hidden_size=256): super(nlp_rnn_fedshakespeare, self).__init__() self.embeddings = nn.Embedding( num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0 ) self.lstm = nn.LSTM( input_size=embedding_dim, hidden_size=hidden_size, num_layers=2, batch_first=True, ) self.fc = nn.Linear(hidden_size, vocab_size) def forward(self, input_seq): embeds = self.embeddings(input_seq) # Note that the order of mini-batch is random so there is no hidden relationship among batches. # So we do not input the previous batch's hidden state, # leaving the first hidden state zero `self.lstm(embeds, None)`. 
lstm_out, _ = self.lstm(embeds) # use the final hidden state as the next character prediction final_hidden_state = lstm_out[:, -1] # output = self.fc(final_hidden_state) # For fed_shakespeare output = self.fc(lstm_out[:, :]) output = torch.transpose(output, 1, 2) return output class RNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = nlp_rnn_fedshakespeare() def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' x, target = input['x'].to(device), input['y'].to(device) output = self.net.forward(x) criterion = nn.CrossEntropyLoss(ignore_index=0).to(device) return criterion(output, target.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' x, target = input['x'].to(device), input['y'].to(device) output = self.net.forward(x) n_samples = x.shape[0] pred = torch.argmax(output, dim=1) mask = (target != 0) accuracy = torch.sum((pred[mask] == target[mask]).float()).item() accuracy = accuracy/mask.sum() return {'output':output, 'acc': accuracy, 'batch_size': n_samples} ================================================ FILE: experiments/semisupervision/README.md ================================================ ### Data In order to run this experiment, you need to previously run the script [cifar_dataset.py](dataloaders/cifar_dataset.py) in order to download and preprocess the CIFAR100 dataset needed for this task. 
```code python experiments/semisupervision/dataloaders/cifar_dataset.py ``` ### Run Once the data has been downloaded, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=2 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/semisupervision/config.yaml -task semisupervision -backend nccl ``` ================================================ FILE: experiments/semisupervision/config.yaml ================================================ # Basic configuration file for running semisupervision with data loaded on-the-fly # Parameters needed to initialize the model model_config: model_type: Res # class w/ `loss` and `inference` methods model_folder: experiments/semisupervision/model.py # file containing class num_classes: 100 # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: FedLabels # Determines all the server-side settings for training and evaluation rounds server_config: send_dicts: true # if true, the server will update model dictionaries instead of grads wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: true # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1 # how many iterations between metric eval on val set rec_freq: 5000 # how many iterations between metric eval on test set initial_val: true initial_rec: false max_iteration: 2000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 64 val_data: null test: batch_size: 64 test_data: null type: model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 0.003 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 64 list_of_train_data: null desired_max_samples: 87000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.003 # this is overridden by `initial_lr_client` momentum: 0 type: optimization semisupervision: uda: 1 num_classes: 100 isclust: 0 alpha: 0.1 train_ratio: 0.2 test_ratio: 0.0 val_ratio: 0.8 vat_ptb: 0 vat_consis: 0.05 lamb_consist: 0.05 unsup_lamb: 1 l2_lambda: 10 burnout_round: 50 thre: 0.3 comp: var eta: 0.003 bs: 64 unl_bs: 128 train_ep: 30 unsuptrain_ep: 10 ensize: 100 seed: 0 temp: 1 device: cuda size: 10 shuffle: 1 ================================================ FILE: experiments/semisupervision/dataloaders/RandAugment.py ================================================ ''' Code in this file is adapted from rpmcruz/autoaugment https://github.com/rpmcruz/autoaugment/blob/master/transformations.py This code is modified version of https://github.com/ildoonet/pytorch-randaugment/blob/master/RandAugment/augmentations.py for randaugmentation. 
''' import random import PIL, PIL.ImageOps, PIL.ImageEnhance, PIL.ImageDraw import numpy as np import torch from PIL import Image def ShearX(img, v): # [-0.3, 0.3] assert -0.3 <= v <= 0.3 if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, v, 0, 0, 1, 0)) def ShearY(img, v): # [-0.3, 0.3] assert -0.3 <= v <= 0.3 if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, v, 1, 0)) def TranslateX(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert -0.45 <= v <= 0.45 if random.random() > 0.5: v = -v v = v * img.size[0] return img.transform(img.size, PIL.Image.AFFINE, (1, 0, v, 0, 1, 0)) def TranslateXabs(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert 0 <= v if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, 0, v, 0, 1, 0)) def TranslateY(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert -0.45 <= v <= 0.45 if random.random() > 0.5: v = -v v = v * img.size[1] return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, v)) def TranslateYabs(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert 0 <= v if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, v)) def Rotate(img, v): # [-30, 30] assert -30 <= v <= 30 if random.random() > 0.5: v = -v return img.rotate(v) def AutoContrast(img, _): return PIL.ImageOps.autocontrast(img) def Invert(img, _): return PIL.ImageOps.invert(img) def Equalize(img, _): return PIL.ImageOps.equalize(img) def Flip(img, _): # not from the paper return PIL.ImageOps.mirror(img) def Solarize(img, v): # [0, 256] assert 0 <= v <= 256 return PIL.ImageOps.solarize(img, v) def SolarizeAdd(img, addition=0, threshold=128): img_np = np.array(img).astype(np.int) img_np = img_np + addition img_np = np.clip(img_np, 0, 255) img_np = img_np.astype(np.uint8) img = Image.fromarray(img_np) return PIL.ImageOps.solarize(img, threshold) def Posterize(img, v): # [4, 8] v = int(v) v = 
max(1, v) return PIL.ImageOps.posterize(img, v) def Contrast(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Contrast(img).enhance(v) def Color(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Color(img).enhance(v) def Brightness(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Brightness(img).enhance(v) def Sharpness(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Sharpness(img).enhance(v) def Cutout(img, v): # [0, 60] => percentage: [0, 0.2] assert 0.0 <= v <= 0.2 if v <= 0.: return img v = v * img.size[0] return CutoutAbs(img, v) def CutoutAbs(img, v): # [0, 60] => percentage: [0, 0.2] # assert 0 <= v <= 20 if v < 0: return img w, h = img.size x0 = np.random.uniform(w) y0 = np.random.uniform(h) x0 = int(max(0, x0 - v / 2.)) y0 = int(max(0, y0 - v / 2.)) x1 = min(w, x0 + v) y1 = min(h, y0 + v) xy = (x0, y0, x1, y1) color = (125, 123, 114) # color = (0, 0, 0) img = img.copy() #print(img) PIL.ImageDraw.Draw(img).rectangle(xy, color) return img def SamplePairing(imgs): # [0, 0.4] def f(img1, v): i = np.random.choice(len(imgs)) img2 = PIL.Image.fromarray(imgs[i]) return PIL.Image.blend(img1, img2, v) return f def Identity(img, v): return img def augment_list(grey): # 16 oeprations and their ranges # https://github.com/google-research/uda/blob/master/image/randaugment/policies.py#L57 # l = [ # (Identity, 0., 1.0), # (ShearX, 0., 0.3), # 0 # (ShearY, 0., 0.3), # 1 # (TranslateX, 0., 0.33), # 2 # (TranslateY, 0., 0.33), # 3 # (Rotate, 0, 30), # 4 # (AutoContrast, 0, 1), # 5 # (Invert, 0, 1), # 6 # (Equalize, 0, 1), # 7 # (Solarize, 0, 110), # 8 # (Posterize, 4, 8), # 9 # # (Contrast, 0.1, 1.9), # 10 # (Color, 0.1, 1.9), # 11 # (Brightness, 0.1, 1.9), # 12 # (Sharpness, 0.1, 1.9), # 13 # # (Cutout, 0, 0.2), # 14 # # (SamplePairing(imgs), 0, 0.4), # 15 # ] if grey: # https://github.com/tensorflow/tpu/blob/8462d083dd89489a79e3200bcc8d4063bf362186/models/official/efficientnet/autoaugment.py#L505 l 
= [ (AutoContrast, 0, 1), (Equalize, 0, 1), (Invert, 0, 1), (Rotate, 0, 30), (Posterize, 0, 4), (Solarize, 0, 256), (SolarizeAdd, 0, 110), (Color, 0.1, 1.9), (Contrast, 0.1, 1.9), (Brightness, 0.1, 1.9), (Sharpness, 0.1, 1.9), (ShearX, 0., 0.3), (ShearY, 0., 0.3), (TranslateXabs, 0., 100), (TranslateYabs, 0., 100), ] else: l = [ (AutoContrast, 0, 1), (Equalize, 0, 1), (Invert, 0, 1), (Rotate, 0, 30), (Posterize, 0, 4), (Solarize, 0, 256), (SolarizeAdd, 0, 110), (Color, 0.1, 1.9), (Contrast, 0.1, 1.9), (Brightness, 0.1, 1.9), (Sharpness, 0.1, 1.9), (ShearX, 0., 0.3), (ShearY, 0., 0.3), (CutoutAbs, 0, 40), (TranslateXabs, 0., 100), (TranslateYabs, 0., 100), ] return l class Lighting(object): """Lighting noise(AlexNet - style PCA - based noise)""" def __init__(self, alphastd, eigval, eigvec): self.alphastd = alphastd self.eigval = torch.Tensor(eigval) self.eigvec = torch.Tensor(eigvec) def __call__(self, img): if self.alphastd == 0: return img alpha = img.new().resize_(3).normal_(0, self.alphastd) rgb = self.eigvec.type_as(img).clone() \ .mul(alpha.view(1, 3).expand(3, 3)) \ .mul(self.eigval.view(1, 3).expand(3, 3)) \ .sum(1).squeeze() return img.add(rgb.view(3, 1, 1).expand_as(img)) class CutoutDefault(object): """ Reference : https://github.com/quark0/darts/blob/master/cnn/utils.py """ def __init__(self, length): self.length = length def __call__(self, img): h, w = img.size(1), img.size(2) mask = np.ones((h, w), np.float32) y = np.random.randint(h) x = np.random.randint(w) y1 = np.clip(y - self.length // 2, 0, h) y2 = np.clip(y + self.length // 2, 0, h) x1 = np.clip(x - self.length // 2, 0, w) x2 = np.clip(x + self.length // 2, 0, w) mask[y1: y2, x1: x2] = 0. 
mask = torch.from_numpy(mask) mask = mask.expand_as(img) img *= mask return img class RandAugment: def __init__(self, n, m, grey=False): self.n = n self.m = m # [0, 30] self.augment_list = augment_list(grey) def __call__(self, img): ops = random.choices(self.augment_list, k=self.n) #print(ops) for op, minval, maxval in ops: val = (float(self.m) / 30) * float(maxval - minval) + minval img = op(img, val) return img ================================================ FILE: experiments/semisupervision/dataloaders/cifar_dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import time import json import torch import numpy as np import pathlib from torchvision import datasets, transforms from torch.utils.data import TensorDataset, DataLoader from numpy.random import RandomState TRAINSET = "trainset.json" TRAINSET_UNLAB = "trainset_unlab.json" TRAINSET_UNLAB_RAND = "trainset_unlab_rand.json" TESTSET = "testset.json" ROOT = './data' class CIFAR100: def __init__(self, user_idx=None, test_only=None, args=None, read_data=True) : if read_data: # Reads the data previously saved on files if user_idx == -1: if test_only: print("Reading testing file") file = os.path.join(ROOT,TESTSET) else: print("Reading training labeled file") file = os.path.join(ROOT,TRAINSET) elif user_idx == -2: print("Reading unlabeled training file") file = os.path.join(ROOT, TRAINSET_UNLAB) elif user_idx == -3: print("Reading unlabeled random training file") file = os.path.join(ROOT, TRAINSET_UNLAB_RAND) with open(file, 'r') as f: json_file = json.load(f) self.data = json_file else: # Create, preprocess and save the datasets from RandAugment import RandAugment trans = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), 
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) transform_unlabeltrain = transforms.Compose([ RandAugment(1, 10), transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) # Download and preprocess datasets trainset = datasets.CIFAR100('./data', train=True, download=True, transform=transform_train) unlabel_trainset = datasets.CIFAR100('./data', train=True, download=True, transform=transform_unlabeltrain) self.pretestset = datasets.CIFAR100('./data', train=False, download=True, transform=trans) train_loader = DataLoader(trainset, batch_size=len(trainset)) ultrain_loader = DataLoader(unlabel_trainset, batch_size=len(unlabel_trainset)) X_train = next(iter(train_loader))[0].numpy() Y_train = next(iter(train_loader))[1].numpy() X_unlabel_train = next(iter(ultrain_loader))[0].numpy() Y_unlabel_train = next(iter(ultrain_loader))[1].numpy() self.pretrainset, trainset_unlab_rand, trainset_unlab, \ self.embed_dim = partition_imagedataset(X_train, Y_train, X_unlabel_train, Y_unlabel_train,args) self.trainset = _process(self.pretrainset, train=True) self.trainset_unlab = _process(trainset_unlab, train=True) self.trainset_unlab_rand = _process(trainset_unlab_rand, train=True) self.testset = _process(self.pretestset, train=False) save_json(self.trainset, TRAINSET) save_json(self.trainset_unlab, TRAINSET_UNLAB) save_json(self.trainset_unlab_rand, TRAINSET_UNLAB_RAND) save_json(self.testset, TESTSET) def save_json(dict, filename): f = open(os.path.join('./data',filename), "w") json.dump(dict,f) f.close() def _process(dataset, train=True): '''Process a Torchvision/preprocessed dataset to expected FLUTE format''' print('Converting data to expected format...') start_time = time.time() data_dict = {'users':[], 'num_samples': [], 'user_data':{}, 'user_data_label':{}} for i in range(len(dataset)): if train: x, y = dataset[i]['x'], dataset[i]['y'] 
else: x, y = dataset[i] data_dict['users'].append(f'{i:04d}') data_dict['num_samples'].append(len(y) if train else 1) data_dict['user_data'][f'{i:04d}'] = [xi.tolist() for xi in x] if train else [x.tolist()] data_dict['user_data_label'][f'{i:04d}'] = [yi.tolist() for yi in y] if train else y print(f'Finished converting data in {time.time() - start_time:.2f}s.') return data_dict def partition_imagedataset(X_train, Y_train, X_unlabel_train, Y_unlabel_train, args): if args['isclust'] == 1: partition = __getClusteredData__(Y_train, args['ensize']) elif args['isclust'] == 2: partition = __getClusteredMixedData__(Y_train, args['ensize']) else: partition = __getDirichletData__(Y_train, args) dataset_train = [] dataset_val = [] dataset_val_norand = [] dataset_test = [] train_ratio = args['train_ratio'] val_ratio = args['val_ratio'] test_ratio = args['test_ratio'] x_for_embed = np.shape(X_train[0]) for (i, ind) in enumerate(partition): x = X_train[ind] y = Y_train[ind] x_ul = X_unlabel_train[ind] y_ul = Y_unlabel_train[ind] n_i = len(ind) train_size = int(train_ratio * n_i) val_size = int(val_ratio * n_i) test_size = int(test_ratio * n_i) x_train = torch.Tensor(x[val_size:val_size + train_size]) y_train = torch.LongTensor(y[val_size:val_size + train_size]) dataset_train_torch = {'x': x_train, 'y':y_train} if val_size == 0: x_val = x_train y_cal = y_train dataset_val_torch = dataset_train_torch dataset_val_torch_norand = dataset_train_torch else: x_val = torch.Tensor(x[:val_size]) y_val = torch.LongTensor(y[:val_size]) x_ul_val = torch.Tensor(x_ul[:val_size]) y_ul_val = torch.LongTensor(y_ul[:val_size]) dataset_val_torch = {'x': x_ul_val, 'y': y_ul_val} dataset_val_torch_norand = {'x':x_val, 'y':y_val} dataset_train.append(dataset_train_torch) dataset_val.append(dataset_val_torch) dataset_val_norand.append(dataset_val_torch_norand) return dataset_train, dataset_val, dataset_val_norand, x_for_embed def __getDirichletData__(y, args): n = args['ensize'] n_nets = args['ensize'] 
K = args['num_classes'] num_c = args['num_classes'] labelList_true = y min_size = 0 N = len(labelList_true) rnd = 0 rann = RandomState(rnd) net_dataidx_map = {} p_client = np.zeros((n, num_c)) for i in range(n): p_client[i] = rann.dirichlet(np.repeat(args['alpha'], num_c)) idx_batch = [[] for _ in range(n_nets)] for k in range(K): idx_k = np.where(labelList_true == k)[0] rann.shuffle(idx_k) proportions = p_client[:, k] proportions = proportions / proportions.sum() proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))] for j in range(n_nets): if args['shuffle'] == 1: rann.shuffle(idx_batch[j]) net_dataidx_map[j] = idx_batch[j] net_cls_counts_label = {} net_cls_counts_unlabel = {} for net_i in range(len(idx_batch)): n_i = len(idx_batch[net_i]) train_size = int(args['train_ratio'] * n_i) val_size = int(args['val_ratio'] * n_i) unq, unq_cnt = np.unique(labelList_true[idx_batch[net_i][val_size:val_size + train_size]], return_counts=True) tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} net_cls_counts_label[net_i] = tmp unq1, unq_cnt1 = np.unique(labelList_true[idx_batch[net_i][:val_size]], return_counts=True) tmp1 = {unq1[i]: unq_cnt1[i] for i in range(len(unq1))} net_cls_counts_unlabel[net_i] = tmp1 local_sizes = [] for i in range(n_nets): local_sizes.append(len(net_dataidx_map[i])) local_sizes = np.array(local_sizes) weights = local_sizes / np.sum(local_sizes) return idx_batch if __name__ == "__main__": # Download and preprocess data args= {'name': 'FedVATnew', 'isaml':0, 'uda':1 , 'dataset': 'cifar100', 'num_classes': 100, 'isclust': 0, 'alpha': 0.1, 'train_ratio': 0.2, 'val_ratio':0.8, 'shuffle':1, 'vat_ptb':0.0 , 'vat_consis':0.05, 'unsup_lamb':1, 'l2_lambda':10, 'bo': 50, 'thre': 0.3, 'comp': 'var', 'eta': 0.003, 'bs':64, 'unl_bs':128, 'train_ep':30, 'unsuptrain_ep':10, 'rounds':2000, 'ensize':100, 'size': 10, 'model': 'RES50', 'seed': 0, 
'test_ratio': 0.0} data = CIFAR100(read_data=False, args=args) ================================================ FILE: experiments/semisupervision/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.semisupervision.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x = np.array(x) y = np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/semisupervision/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class Dataset(BaseDataset):
    """Per-user view over the CIFAR100 semi-supervision data.

    When ``test_only`` is set, features/labels of every user are merged
    into a single evaluation split; otherwise only the data of the user
    at ``user_idx`` is exposed. ``user_idx == -1`` defers the assignment
    (no features/labels are materialized yet).
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx
        args = kwargs.get('args', None)

        # Read the full dataset (or reuse the dict that was passed in)
        self.user_list, self.user_data, self.user_data_label, self.num_samples = \
            self.load_data(data, self.test_only, args)

        if user_idx == -1:
            return  # data will be attached later by the framework

        if self.test_only:
            # Merge every user into one flat evaluation set
            self.user = 'test_only'
            self.features = np.vstack(list(self.user_data.values()))
            self.labels = np.hstack(list(self.user_data_label.values()))
        else:
            # Single user's shard
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            self.user = self.user_list[user_idx]
            self.features = self.user_data[self.user]
            self.labels = self.user_data_label[self.user]

    def __getitem__(self, idx):
        return np.array(self.features[idx]).astype(np.float32), self.labels[idx]

    def __len__(self):
        return len(self.features)

    def load_data(self, data, test_only, sup_config):
        '''Wrapper method to read/instantiate the dataset'''
        if data is None:
            data = CIFAR100(self.user_idx, test_only, sup_config).data
        return (
            data['users'],
            data['user_data'],
            data['user_data_label'],
            data['num_samples'],
        )
class BasicBlock(nn.Module):
    """Two 3x3 convolutions with an identity (or projected) skip connection."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Project the input whenever the spatial size or channel count changes
        out_planes = self.expansion * planes
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = self.bn2(self.conv2(h))
        h = h + self.shortcut(x)
        return F.relu(h)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck with a 4x channel expansion."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        # Project the input whenever the spatial size or channel count changes
        out_planes = self.expansion * planes
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = self.bn3(self.conv3(h))
        h = h + self.shortcut(x)
        return F.relu(h)


class ResNet(nn.Module):
    """CIFAR-style ResNet (3x3 stem, four stages, 4x4 average pooling)."""

    def __init__(self, block, num_blocks, num_classes=10, inchannels=3):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(inchannels, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples
        blocks = []
        for s in [stride] + [1] * (num_blocks - 1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            h = stage(h)
        h = F.avg_pool2d(h, 4)
        h = h.view(h.size(0), -1)
        return self.linear(h)


def ResNet18(num_classes=10):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

def ResNet18_emnist(num_classes=62, inchannel=1):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet18_organ(num_classes=11, inchannel=1):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet18_path(num_classes=9, inchannel=3):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet18_blood(num_classes=8, inchannel=3):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet34(num_classes=10):
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)

def ResNet50(num_classes=10):
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)

def ResNet101(num_classes=10):
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)

def ResNet152(num_classes=10):
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)


def test():
    # Quick smoke test: one forward pass on a CIFAR-sized input
    net = ResNet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())
class Res(BaseModel):
    '''ResNet50 classifier wrapped with the loss/inference hooks FLUTE expects'''

    def __init__(self, model_config):
        super().__init__()
        self.net = ResNet50(num_classes=model_config['num_classes'])

    def forward(self, x):
        return self.net.forward(x)

    def loss(self, input: torch.Tensor) -> torch.Tensor:
        '''Performs forward step and computes the loss'''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        logits = self.net.forward(features)
        if self.net.training:
            return torch.nn.CrossEntropyLoss()(logits, labels)
        # Evaluation: sum over the batch, then normalize by batch size
        return F.cross_entropy(logits, labels, reduction='sum') / labels.size(0)

    def inference(self, input):
        '''Performs forward step and computes metrics'''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        # A 0-d label tensor is promoted to a 1-element batch
        if len(np.shape(labels)) == 0:
            labels = torch.stack([labels])
        output = self.net.forward(features)
        log_probs = torch.nn.LogSoftmax(dim=1)(output)
        _, predicted = log_probs.max(1)
        n_samples = labels.size(0)
        n_correct_pct = predicted.eq(labels).sum().item() * 100
        return {'output': output,
                'acc': n_correct_pct / n_samples,
                'batch_size': n_samples}
""" super(SequenceWise, self).__init__() self.module = module def forward(self, x): t, n = x.size(0), x.size(1) x = x.view(t * n, -1) x = x.contiguous() x = self.module(x) x = x.view(t, n, -1) return x def __repr__(self): tmpstr = self.__class__.__name__ + ' (\n' tmpstr += self.module.__repr__() tmpstr += ')' return tmpstr class BatchRNN(nn.Module): def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True,dropout=0.0,multi=1): super(BatchRNN, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.batch_norm_activate = batch_norm self.bidirectional = bidirectional self.multi = multi self.dropout = dropout if self.batch_norm_activate: self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) self.rnn = rnn_type(input_size = input_size, hidden_size = hidden_size, bidirectional= bidirectional, bias = True, batch_first = True, dropout = self.dropout) self.num_directions = 2 if bidirectional else 1 def forward(self, x): if x.dim()==2: x=x.unsqueeze(1) if self.batch_norm_activate: x = x.contiguous() x = self.batch_norm(x) x, _ = self.rnn(x) if self.bidirectional and self.multi<2: x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) return x class NeuralNetwork(nn.Module): def __init__(self, params, wantLSTM=False, batch_norm=False): super(NeuralNetwork, self).__init__() """ The following parameters need revisiting self.number_of_actions = 2 self.gamma = 0.99 self.final_epsilon = 0.0001 self.initial_epsilon = 0.1 self.number_of_iterations = 2000000 self.replay_memory_size = 10000 self.minibatch_size = 32 optimizer = optim.Adam(model.parameters(), lr=1e-6) criterion = nn.MSELoss() """ self.wantLSTM = wantLSTM self.batch_norm= batch_norm params = [int(x) for x in params.split(',')] layers = [] self.softmax = nn.Softmax(dim = 1) if self.wantLSTM: # Recurrent Component of the architecture rnns = [] for i in range(1, len(params) - 2): multi = 1 if i==1 else 1 rnn = BatchRNN(input_size 
class RL:
    """Deep Q-learning agent used by the server to weight/select clients.

    Maintains a replay memory of (state, action, reward) transitions, an
    epsilon-greedy policy over a NeuralNetwork Q-model, and checkpointing
    of model/optimizer/scheduler state plus training stats to disk.
    """

    def __init__(self, config=None):
        # Finalized config-file
        self.config = config
        self.out_size = config["num_clients_per_iteration"]
        self.wantLSTM = config['RL']['wantLSTM'] if 'wantLSTM' in config['RL'] else False
        self.replay_memory = []
        self.state_memory = []
        self.epsilon = config['RL']['initial_epsilon']
        self.step = 0
        self.runningLoss = 0

        model_descriptor = config['RL']['model_descriptor_RL'] if 'model_descriptor_RL' in config['RL'] else 'Default'
        self.model_name = os.path.join(config['RL']['RL_path'],
                                       'rl_{}.{}.model'.format(self.out_size, model_descriptor))
        self.stats_name = os.path.join(config['RL']['RL_path'],
                                       'rl_{}.{}.stats'.format(self.out_size, model_descriptor))

        # Initialize RL model
        self.make_model()
        self.load_saved_status()

        # Set the RL weights
        self.rl_weights = None
        self.rl_losses = None

        self.criterion = nn.MSELoss()

    def set_losses(self, losses):
        self.rl_losses = losses

    def set_weights(self, weights):
        self.rl_weights = weights

    def forward(self, state=None):
        """Return an action vector for ``state`` (epsilon-greedy exploration)."""
        if self.wantLSTM:
            # Roll the new state into a fixed-size window of past states
            N = len(state)
            state.resize(1, N)
            if len(self.state_memory) == 0:
                self.state_memory = np.zeros((self.config['RL']['minibatch_size'], N))
            self.state_memory = np.concatenate((self.state_memory[1:], state), axis=0)
            state = self.state_memory

        if random.random() <= self.epsilon:
            print_rank("Performed random action!")
            action = to_device(torch.rand(self.out_size))
        else:
            state = to_device(torch.from_numpy(state))
            print_rank(f'RL_state: {state.shape}')
            action = self.model(state.float())
        return action

    def train(self, batch=None):
        """One DQN update step from a (state, action, reward) transition."""
        # save transition to replay memory; drop the oldest when full
        self.replay_memory.append(batch)
        if len(self.replay_memory) > self.config['RL']['max_replay_memory_size']:
            self.replay_memory.pop(0)

        # epsilon annealing (stops once final_epsilon would be undershot)
        self.epsilon *= self.config['RL']['epsilon_gamma'] if self.epsilon * self.config['RL']['epsilon_gamma'] > self.config['RL']['final_epsilon'] else 1.0

        # sample minibatch: most recent window for LSTM, uniform otherwise
        if self.wantLSTM:
            if len(self.replay_memory) >= self.config['RL']['minibatch_size']:
                minibatch = self.replay_memory[-self.config['RL']['minibatch_size']:]
            else:
                minibatch = self.replay_memory
        else:
            minibatch = random.sample(self.replay_memory,
                                      min(len(self.replay_memory), self.config['RL']['minibatch_size']))

        # unpack minibatch
        state_batch = torch.tensor(tuple(d[0] for d in minibatch)).float()
        action_batch = torch.tensor(tuple(d[1] for d in minibatch)).float()
        reward_batch = torch.tensor(tuple(d[2] for d in minibatch)).float()

        state_batch = to_device(state_batch)
        action_batch = to_device(action_batch)
        reward_batch = to_device(reward_batch)

        # set y_j to r_j for terminal state, otherwise to r_j + gamma*max(Q)
        y_batch = reward_batch

        # extract Q-value
        print_rank(f'RL state_batch: {state_batch.shape}', loglevel=logging.DEBUG)
        state_output = self.model(state_batch)
        print_rank(f'RL train shapes: {state_batch.shape} {action_batch.shape} {state_output.shape}', loglevel=logging.DEBUG)
        q_value = torch.sum(state_output * action_batch, dim=1)

        # reset gradient
        self.optimizer.zero_grad()

        # target is detached from the graph: it never requires gradient
        y_batch = y_batch.detach()

        # calculate loss and backpropagate
        loss = self.criterion(q_value, y_batch)
        loss.backward()
        self.optimizer.step()

        # Tracking a running average of loss
        if self.runningLoss == 0:
            self.runningLoss = loss.item()
        else:
            self.runningLoss = 0.95 * self.runningLoss + 0.05 * loss.item()
        print_rank('Running Loss for RL training process: {}'.format(self.runningLoss))

        # Decay learning rate
        self.lr_scheduler.step()

    def make_model(self):
        """Instantiate the Q-network, its optimizer and LR scheduler."""
        self.model = NeuralNetwork(self.config['RL']['network_params'],
                                   self.config['RL']['wantLSTM'] if 'wantLSTM' in self.config['RL'] else False,
                                   self.config['RL']['batchNorm'] if 'batchNorm' in self.config['RL'] else False)
        print(self.model)
        # BUG FIX: the original did `model = to_device(model)`, which read an
        # undefined local name (NameError) and never moved the actual network.
        self.model = to_device(self.model)

        # make optimizer
        self.optimizer = make_optimizer(self.config['RL']["optimizer_config"], self.model)

        # make lr_scheduler
        self.lr_scheduler = make_lr_scheduler(
            self.config['RL']['annealing_config'],
            self.optimizer,
            num_batches=1)

    def load_saved_status(self):
        """Resume model weights and training stats from disk when present."""
        if os.path.exists(self.model_name):
            print_rank("Resuming from checkpoint model {}".format(self.model_name))
            self.load()

        if os.path.exists(self.stats_name):
            with open(self.stats_name, 'r') as logfp:
                # loading the iteration no., val_loss and lr_weight
                elems = json.load(logfp)
                self.cur_iter_no = elems["i"]
                self.val_loss = elems["val_loss"]
                self.val_cer = elems["val_cer"]
                self.runningLoss = elems["weight"]

    def load(self):
        """Restore model/optimizer/scheduler state dicts from the checkpoint."""
        print_rank("Loading checkpoint: {}".format(self.model_name))
        checkpoint = torch.load(self.model_name)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        if self.optimizer is not None:
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        anl_st_dict = checkpoint.get('lr_scheduler_state_dict')
        if anl_st_dict and self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(anl_st_dict)

    def save(self, i):
        """
        Save a model as well as training information
        """
        save_state = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict() if self.optimizer is not None else None,
            'lr_scheduler_state_dict': self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None
        }

        outputdir = os.path.dirname(self.model_name)
        if os.path.exists(outputdir) is False:
            os.makedirs(outputdir, exist_ok=True)

        print_rank("Saving model to: {}".format(self.model_name))
        try_except_save(torch_save, state_or_model=save_state, save_path=self.model_name)

        # logging the latest best values
        print_rank(f'Saving stats to {self.stats_name}')
        with open(self.stats_name, 'w') as logfp:
            json.dump({"i": i + 1,
                       "val_loss": float(self.rl_losses[0]),
                       "val_cer": float(self.rl_losses[1]),
                       "weight": float(self.runningLoss)}, logfp)
def compute_LDP_noise_std(eps, max_sensitivity, delta):
    """Gaussian-mechanism noise std for (eps, delta)-DP at the given L2 sensitivity."""
    return np.sqrt(2 * np.log(1.25 / delta)) * max_sensitivity / eps


def _beta2betainc_ratio(a, x):
    # Reciprocal of the regularized incomplete beta function I_x(a, a)
    return 1 / betainc(a, a, x)


def _log_m1(d, alpha, gamma):
    return alpha * np.log(1 - gamma**2) - (d - 2) * np.log(2) - np.log(d - 1)


def _log_m2(p, tau, alpha):
    ratio = _beta2betainc_ratio(alpha, tau)
    return np.log(p / (ratio - 1) - (1 - p)) + np.log(ratio) - betaln(alpha, alpha)


def _efficient_m(d, gamma, p):
    alpha = (d - 1) / 2
    tau = (1 + gamma) / 2
    return np.exp(_log_m1(d, alpha, gamma) + _log_m2(p, tau, alpha))


def privacy_parameters(eps0, eps, d):
    """Map the split budgets (eps0, eps) to PrivUnit2's sampling prob and cap gamma."""
    exp_eps0 = np.exp(eps0)
    p0 = 1 if exp_eps0 == np.inf else exp_eps0 / (1 + exp_eps0)

    exp_eps = np.exp(eps)
    base = np.sqrt(np.pi / (2 * (d - 1)))
    gamma = base if exp_eps == np.inf else ((exp_eps - 1) / (exp_eps + 1)) * base
    return p0, gamma


def private_unit2(grad, gamma, prob):
    """PrivUnit2: privatize a unit-norm vector via cap-based rejection sampling."""
    np.testing.assert_almost_equal(grad.norm().cpu().item(), 1, decimal=5)
    assert prob >= 0.5
    assert (0 <= gamma <= 1)
    p = T.rand(())
    while True:
        # create a uniform distribution over the d-sphere
        V = T.normal(0, 1, grad.shape, device=grad.device)
        V = V / V.norm()
        dot_prod = T.dot(V, grad)
        # Accept inside the cap w.p. prob, outside w.p. 1-prob
        if (dot_prod >= gamma and p < prob) or (dot_prod < gamma and p >= prob):
            break
    m = _efficient_m(grad.shape[0], gamma, prob)
    return V / m


def add_gaussian_noise(grad, eps, max_grad, delta):
    """Add (eps, delta)-DP Gaussian noise; returns the noisy tensor and sigma used."""
    sigma = compute_LDP_noise_std(eps, max_grad, delta)
    noisy_grad = sigma * T.randn(grad.shape, device=grad.device) + grad
    return noisy_grad, sigma


def add_private_unit2_noise(eps, grad):
    """Split eps (1% / 99%) and apply PrivUnit2 to a unit-norm gradient."""
    eps0 = 0.01 * eps
    eps1 = 0.99 * eps
    samp_prob, gamma = privacy_parameters(eps0, eps1, grad.shape[0])
    return private_unit2(grad, gamma, samp_prob)
def scalar_DP(r, eps, k, r_max):
    """Randomize a scalar in [0, r_max] with eps-LDP via k-level discretization."""
    r = np.minimum(r, r_max)
    val = k * r / r_max
    f_val = math.floor(val)
    c_val = math.ceil(val)
    # Unbiased rounding to one of the two neighbouring levels
    J = f_val if T.rand(()) < (c_val - val) else c_val

    exp_eps = np.exp(eps)
    rand_prob = exp_eps / (exp_eps + k)
    if T.rand(()) >= rand_prob:
        # Randomized response: replace J with a uniformly drawn different level
        while True:
            J_ = T.randint(0, k + 1, ()).item()
            if J != J_:
                J = J_
                break

    # Debias the randomized level back to the original scale
    a = ((exp_eps + k) / (exp_eps - 1)) * (r_max / k)
    b = (k * (k + 1)) / (2 * (exp_eps + k))
    return a * (J - b)


def laplace_noise(max_sens, eps, vocab_size):
    """Draw Laplace noise with scale max_sens/eps for each vocabulary entry."""
    return np.random.laplace(0.0, max_sens / eps, vocab_size)


def unroll_network(named_params, select_grad=False):
    """Flatten parameters (or their grads) into a single 1-D tensor.

    Returns the concatenated vector plus a dict mapping each parameter
    name to its (start, end) slice so the layout can be restored later.
    """
    params_ids, flat_params = {}, []
    cursor = 0
    for name, param in named_params:
        tensor = param.grad if select_grad else param.data
        flat = tensor.view(-1)
        flat_params.append(flat)
        params_ids[name] = (cursor, cursor + flat.shape[0])
        cursor += flat.shape[0]
    return T.cat(flat_params), params_ids


def update_network(named_params, params_ids, flat_params, apply_to_grad=False):
    """Write a flat vector produced by unroll_network back into the layers."""
    for name, param in named_params:
        start, end = params_ids[name]
        chunk = flat_params[start:end]
        if apply_to_grad:
            param.grad.copy_(chunk.view(*param.grad.shape))
        else:
            param.data.copy_(chunk.view(*param.data.shape))
def apply_local_dp(trainer, weight, dp_config, add_weight_noise):
    '''Apply client-side DP, possibly given a data-dependent aggregation weight

    Clips the client gradient to ``max_grad``; when ``eps`` >= 0 it also
    rescales the gradient to that norm and adds Gaussian LDP noise, optionally
    noising the aggregation weight jointly with the gradient.

    Args:
        trainer (core.Trainer object): trainer on client.
        weight (float): this client's aggregation weight.
        dp_config (dict): DP config on original config file.
        add_weight_noise (bool): whether noise should be added to aggregation weight.

    Returns:
        The aggregation weight (noised only when ``add_weight_noise`` is set).
    '''
    # Unroll the network grads as 1D vectors
    flat_grad, params_ids = unroll_network(trainer.model.named_parameters(), select_grad=True)
    grad_norm = flat_grad.norm().cpu().item()

    if dp_config['eps'] < 0:
        # Negative eps disables noise: clip only.
        # NOTE(review): grads are written back only when clipping occurred —
        # assumes unclipped grads need no update; confirm against callers.
        if grad_norm > dp_config['max_grad']:
            flat_grad = flat_grad * (dp_config['max_grad'] / grad_norm)
            update_network(trainer.model.named_parameters(), params_ids, flat_grad, apply_to_grad=True)
    else:
        # Get Gaussian LDP noise
        dp_eps = dp_config['eps']
        delta = dp_config.get('delta', 1e-7)  # TODO pre-compute in config
        weight_ = weight  # keep the original weight for the no-noise case

        # Scaling the weight down so we don't impact the noise too much
        weight = dp_config.get('weight_scaler', 1) * weight
        weight = min(dp_config['max_weight'], weight)

        # Normalize the gradient to exactly max_grad, then append the weight
        # as one extra coordinate so gradient and weight are noised jointly
        flat_noisy_grad = dp_config['max_grad'] * (flat_grad / flat_grad.norm())
        max_sensitivity = np.sqrt(dp_config['max_grad']**2 + (dp_config['max_weight']**2 if add_weight_noise else 0.0))
        flat_noisy_grad = T.cat([flat_noisy_grad, T.tensor([weight], device=flat_noisy_grad.device)], dim=0)
        flat_noisy_grad, _ = add_gaussian_noise(flat_noisy_grad, dp_eps, max_sensitivity, delta)

        # Clamp the noised weight back into [min_weight, max_weight]
        weight = min(max(flat_noisy_grad[-1].item(), dp_config['min_weight']), dp_config['max_weight'])

        # Scaling the weight back up after noise addition (This is a DP-protect transformation)
        weight = weight / dp_config.get('weight_scaler', 1)

        if not add_weight_noise:
            weight = weight_
        # Drop the appended weight coordinate before restoring the gradient
        flat_noisy_grad = flat_noisy_grad[:-1]

        print_rank('Cosine error from noise {}'.format(T.nn.functional.cosine_similarity(flat_grad, flat_noisy_grad, dim=0)), loglevel=logging.DEBUG)
        print_rank('Error from noise is {}'.format((flat_grad-flat_noisy_grad).norm()), loglevel=logging.DEBUG)
        print_rank('weight is {} and noisy weight is {}'.format(weight_, weight), loglevel=logging.DEBUG)

        # Return back to the network
        update_network(trainer.model.named_parameters(), params_ids, flat_noisy_grad, apply_to_grad=True)

    return weight
def update_privacy_accountant(config, num_clients, curr_iter, num_clients_curr_iter):
    '''Update the RDP privacy accountant and log the accounting parameters.

    All needed parameters are dumped to the log so as not to slow down training.

    Args:
        config (dict): full experiment config; may contain a `dp_config` entry.
        num_clients (int): total number of users.
        curr_iter (int): current (0-based) training iteration.
        num_clients_curr_iter (int): number of users sampled this iteration.

    Returns:
        The RDP epsilon spent so far, or None when DP is disabled.
    '''
    dp_config = config.get('dp_config', None)

    # BUG FIX: the original condition was
    #   `dp_config is not None and dp_config.get(...) or dp_config.get(...)`
    # which, by and/or precedence, dereferenced dp_config even when it is None
    # (AttributeError). Parenthesize the flag check instead.
    if dp_config is not None and (dp_config.get('enable_global_dp', False)
                                  or dp_config.get('enable_local_dp', False)):
        from math import sqrt, exp, log
        import extensions.privacy.analysis as privacy_analysis

        K = 1  # from DP perspective each user is contributing one gradient
        B = num_clients_curr_iter  # batch size
        n = num_clients
        T = curr_iter + 1
        _delta = dp_config.get('delta', min(1e-7, 1. / (n * log(n))))  # TODO should be precomputed in config

        # Derive the noise multiplier: either from the local noise scale,
        # or directly from the configured global sigma.
        if dp_config.get('global_sigma', None) is None:
            max_sensitivity = np.sqrt(dp_config['max_grad'] ** 2 + dp_config['max_weight'] ** 2)
            noise_scale = compute_LDP_noise_std(dp_config['eps'], max_sensitivity, _delta)
            global_sigma = noise_scale * np.sqrt(B) / max_sensitivity
        else:
            global_sigma = dp_config['global_sigma']
            noise_scale = global_sigma * dp_config['max_grad'] / B

        try:
            mu = K * B / n * sqrt(T * exp((1. / global_sigma) ** 2 - 1))
        except OverflowError:
            print_rank(f"Error computing mu {global_sigma} {K} {B} {n} {T}")
            mu = -1

        orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
                  list(range(5, 64)) + [128, 256, 512])
        q = B / n
        _sigma = global_sigma  # was: noise_scale but we should apply the noise multiplier.
        rdp = privacy_analysis.compute_rdp(q, _sigma, T, orders)
        rdp_epsilon, opt_order = privacy_analysis.get_privacy_spent(orders, rdp, _delta)

        props = {
            'dp_global_K': K,  # gradients per user
            'dp_global_B': B,  # users per batch
            'dp_global_n': n,  # total users
            'dp_global_T': T,  # how many iterations
            'dp_sigma': _sigma,  # noise_multiplier. Should be combined global+local sigma.
            'dp_global_mu': mu,
            'dp_epsilon_rdp': rdp_epsilon,
            'dp_opt_order': opt_order,
            'dp_delta': _delta,
            'dp_noise_scale': noise_scale  # Note: not needed for accounting.
        }

        print_rank(f'DP accounting: {json.dumps(props)}')
        for k in props:
            run.log(k, props[k])

        return rdp_epsilon
    else:
        return None
} print_rank(f'DP accounting: {json.dumps(props)}') for k in props: run.log(k, props[k]) return rdp_epsilon else: return None ================================================ FILE: extensions/privacy/analysis.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ *Borrowed from Facebook Opacus, which in turn borrowed from Tensorflow Privacy. *Facebook's original notice follows below. *Based on Google's TF Privacy:* https://github.com/tensorflow/privacy/blob/master/tensorflow_privacy/privacy/analysis/rdp_accountant.py. *Here, we update this code to Python 3, and optimize dependencies.* Functionality for computing Renyi Differential Privacy (RDP) of an additive Sampled Gaussian Mechanism (SGM). Example: Suppose that we have run an SGM applied to a function with L2-sensitivity of 1. Its parameters are given as a list of tuples ``[(q_1, sigma_1, steps_1), ..., (q_k, sigma_k, steps_k)],`` and we wish to compute epsilon for a given target delta. The example code would be: >>> max_order = 32 >>> orders = range(2, max_order + 1) >>> rdp = np.zeros_like(orders, dtype=float) >>> for q, sigma, steps in parameters: >>> rdp += privacy_analysis.compute_rdp(q, sigma, steps, orders) >>> epsilon, opt_order = privacy_analysis.get_privacy_spent(orders, rdp, delta) """ import math import numpy as np from scipy import special from typing import List, Tuple, Union ######################## # LOG-SPACE ARITHMETIC # ######################## def _log_add(logx: float, logy: float) -> float: r"""Adds two numbers in the log space. Args: logx: First term in log space. logy: Second term in log space. Returns: Sum of numbers in log space. 
""" a, b = min(logx, logy), max(logx, logy) if a == -np.inf: # adding 0 return b # Use exp(a) + exp(b) = (exp(a - b) + 1) * exp(b) return math.log1p(math.exp(a - b)) + b # log1p(x) = log(x + 1) def _log_sub(logx: float, logy: float) -> float: r"""Subtracts two numbers in the log space. Args: logx: First term in log space. Expected to be greater than the second term. logy: First term in log space. Expected to be less than the first term. Returns: Difference of numbers in log space. Raises: ValueError If the result is negative. """ if logx < logy: raise ValueError("The result of subtraction must be non-negative.") if logy == -np.inf: # subtracting 0 return logx if logx == logy: return -np.inf # 0 is represented as -np.inf in the log space. try: # Use exp(x) - exp(y) = (exp(x - y) - 1) * exp(y). return math.log(math.expm1(logx - logy)) + logy # expm1(x) = exp(x) - 1 except OverflowError: return logx def _compute_log_a_for_int_alpha(q: float, sigma: float, alpha: int) -> float: r"""Computes :math:`log(A_\alpha)` for integer ``alpha``. Notes: Note that :math:`A_\alpha` is real valued function of ``alpha`` and ``q``, and that 0 < ``q`` < 1. Refer to Section 3.3 of https://arxiv.org/pdf/1908.10530.pdf for details. Args: q: Sampling rate of SGM. sigma: The standard deviation of the additive Gaussian noise. alpha: The order at which RDP is computed. Returns: :math:`log(A_\alpha)` as defined in Section 3.3 of https://arxiv.org/pdf/1908.10530.pdf. """ # Initialize with 0 in the log space. log_a = -np.inf for i in range(alpha + 1): log_coef_i = ( math.log(special.binom(alpha, i)) + i * math.log(q) + (alpha - i) * math.log(1 - q) ) s = log_coef_i + (i * i - i) / (2 * (sigma ** 2)) log_a = _log_add(log_a, s) return float(log_a) def _compute_log_a_for_frac_alpha(q: float, sigma: float, alpha: float) -> float: r"""Computes :math:`log(A_\alpha)` for fractional ``alpha``. Notes: Note that :math:`A_\alpha` is real valued function of ``alpha`` and ``q``, and that 0 < ``q`` < 1. 
def _compute_log_a_for_frac_alpha(q: float, sigma: float, alpha: float) -> float:
    r"""Computes :math:`log(A_\alpha)` for fractional ``alpha``.

    Notes:
        Note that :math:`A_\alpha` is real valued function of ``alpha`` and
        ``q``, and that 0 < ``q`` < 1.

        Refer to Section 3.3 of https://arxiv.org/pdf/1908.10530.pdf
        for details.

    Args:
        q: Sampling rate of SGM.
        sigma: The standard deviation of the additive Gaussian noise.
        alpha: The order at which RDP is computed.

    Returns:
        :math:`log(A_\alpha)` as defined in Section 3.3 of
        https://arxiv.org/pdf/1908.10530.pdf.
    """
    # The two parts of A_alpha, integrals over (-inf,z0] and [z0, +inf), are
    # initialized to 0 in the log space:
    log_a0, log_a1 = -np.inf, -np.inf
    i = 0

    # z0 is the split point between the two integration domains
    z0 = sigma ** 2 * math.log(1 / q - 1) + 0.5

    while True:  # do ... until loop
        coef = special.binom(alpha, i)
        log_coef = math.log(abs(coef))
        j = alpha - i

        log_t0 = log_coef + i * math.log(q) + j * math.log(1 - q)
        log_t1 = log_coef + j * math.log(q) + i * math.log(1 - q)

        log_e0 = math.log(0.5) + _log_erfc((i - z0) / (math.sqrt(2) * sigma))
        log_e1 = math.log(0.5) + _log_erfc((z0 - j) / (math.sqrt(2) * sigma))

        log_s0 = log_t0 + (i * i - i) / (2 * (sigma ** 2)) + log_e0
        log_s1 = log_t1 + (j * j - j) / (2 * (sigma ** 2)) + log_e1

        # Fractional binomial coefficients alternate in sign, so each term is
        # added or subtracted in log space accordingly
        if coef > 0:
            log_a0 = _log_add(log_a0, log_s0)
            log_a1 = _log_add(log_a1, log_s1)
        else:
            log_a0 = _log_sub(log_a0, log_s0)
            log_a1 = _log_sub(log_a1, log_s1)

        i += 1
        # stop once both terms are negligibly small
        if max(log_s0, log_s1) < -30:
            break

    return _log_add(log_a0, log_a1)
""" if float(alpha).is_integer(): return _compute_log_a_for_int_alpha(q, sigma, int(alpha)) else: return _compute_log_a_for_frac_alpha(q, sigma, alpha) def _log_erfc(x: float) -> float: r"""Computes :math:`log(erfc(x))` with high accuracy for large ``x``. Helper function used in computation of :math:`log(A_\alpha)` for a fractional alpha. Args: x: The input to the function Returns: :math:`log(erfc(x))` """ return math.log(2) + special.log_ndtr(-x * 2 ** 0.5) def _compute_rdp(q: float, sigma: float, alpha: float) -> float: r"""Computes RDP of the Sampled Gaussian Mechanism at order ``alpha``. Args: q: Sampling rate of SGM. sigma: The standard deviation of the additive Gaussian noise. alpha: The order at which RDP is computed. Returns: RDP at order ``alpha``; can be np.inf. """ if q == 0: return 0 # no privacy if sigma == 0: return np.inf if q == 1.0: return alpha / (2 * sigma ** 2) if np.isinf(alpha): return np.inf return _compute_log_a(q, sigma, alpha) / (alpha - 1) def compute_rdp( q: float, noise_multiplier: float, steps: int, orders: Union[List[float], float] ) -> Union[List[float], float]: r"""Computes Renyi Differential Privacy (RDP) guarantees of the Sampled Gaussian Mechanism (SGM) iterated ``steps`` times. Args: q: Sampling rate of SGM. noise_multiplier: The ratio of the standard deviation of the additive Gaussian noise to the L2-sensitivity of the function to which it is added. Note that this is same as the standard deviation of the additive Gaussian noise when the L2-sensitivity of the function is 1. steps: The number of iterations of the mechanism. orders: An array (or a scalar) of RDP orders. Returns: The RDP guarantees at all orders; can be ``np.inf``. 
""" if isinstance(orders, float): rdp = _compute_rdp(q, noise_multiplier, orders) else: rdp = np.array([_compute_rdp(q, noise_multiplier, order) for order in orders]) return rdp * steps def get_privacy_spent( orders: Union[List[float], float], rdp: Union[List[float], float], delta: float ) -> Tuple[float, float]: r"""Computes epsilon given a list of Renyi Differential Privacy (RDP) values at multiple RDP orders and target ``delta``. Args: orders: An array (or a scalar) of orders (alphas). rdp: A list (or a scalar) of RDP guarantees. delta: The target delta. Returns: Pair of epsilon and optimal order alpha. Raises: ValueError If the lengths of ``orders`` and ``rdp`` are not equal. """ orders_vec = np.atleast_1d(orders) rdp_vec = np.atleast_1d(rdp) if len(orders_vec) != len(rdp_vec): raise ValueError( f"Input lists must have the same length.\n" f"\torders_vec = {orders_vec}\n" f"\trdp_vec = {rdp_vec}\n" ) eps = rdp_vec - math.log(delta) / (orders_vec - 1) # special case when there is no privacy if np.isnan(eps).all(): return np.inf, np.nan idx_opt = np.nanargmin(eps) # Ignore NaNs return eps[idx_opt], orders_vec[idx_opt] ================================================ FILE: extensions/privacy/dp_kmeans.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import sys import numpy as np from scipy.special import gammainc from sklearn.cluster import KMeans from sklearn import cluster as skcluster kmeans_single = skcluster._kmeans.lloyd_iter_chunked_dense def sample(ndim, r, num_samples=1): x = np.random.normal(size=(num_samples, ndim)) ssq = np.sum(x**2,axis=1) fr = r*gammainc(ndim/2,ssq/2)**(1/ndim)/np.sqrt(ssq) if num_samples > 1: fr = np.tile(fr.reshape(num_samples,1),(1,ndim)) return np.multiply(x,fr) def sphere_packing_initialization(n_clusters, n_dim, min_cluster_radius, max_space_size, max_failed_cases, verbose=None): a, max_r = min_cluster_radius, max_space_size centers = np.empty((n_clusters, n_dim)) cluster_id = 0 fail_count = 0 r = max_r - a while cluster_id < n_clusters: v = sample(n_dim, r) if cluster_id > 0 and np.min(np.linalg.norm(centers[:cluster_id, :] - v, axis=-1)) < 2 * a: fail_count += 1 if fail_count >= max_failed_cases: fail_count = 0 cluster_id = 0 a = a / 2 # TODO Use binary search to find maximum a that don't fail (vaguely discribed in the diff-p kmeas paper) if verbose: print(f'Failing to pack, halving min_cluster_radius to {a}') r = max_r - a continue centers[cluster_id] = v cluster_id += 1 if verbose: print('Final min_cluster_radius', a) return centers, a def add_gaussian_noise(centers_new, weight_in_clusters, eps, max_cluster_l2, max_sample_weight, cluster_to_weight_ratio=-1, delta=1e-7, verbose=None): scaler = 1 if cluster_to_weight_ratio > 0: # Compute the scaler to apply to the sample weights scaler = max_cluster_l2 / (max_sample_weight * cluster_to_weight_ratio) max_sample_weight *= scaler max_l2_sensitivity = np.sqrt(max_cluster_l2 ** 2 + max_sample_weight ** 2) sigma = np.sqrt(2 * np.log(1.25 / delta)) * max_l2_sensitivity / eps if verbose: print('cluster_to_weight_ratio', cluster_to_weight_ratio, 'scaler', scaler, 'max_sample_weight', max_sample_weight, 'max_l2_sensitivity', max_l2_sensitivity, 'sigma', sigma) centers_sum = (centers_new * weight_in_clusters.reshape(-1, 1)) + 
np.random.normal(scale=sigma, size=centers_new.shape) # Scale the sample weights by scaling the cluster weights, since (s*w1 + s*w2, ...) == s*(w1 + w2 + ...), where s is the scaler # Add noise then rescale back. We should never get negative weights because of the noise weight_in_clusters[:] = np.maximum(1e-10, (weight_in_clusters * scaler) + np.random.normal(scale=sigma, size=weight_in_clusters.shape)) / scaler centers_new[:] = centers_sum / weight_in_clusters.reshape(-1, 1) def DPKMeans(n_dim, eps, max_cluster_l2, max_sample_weight=1.0, max_iter=300, cluster_to_weight_ratio=-1, n_clusters=8, tol=1e-4, verbose=0, delta=1e-7, max_failed_cases=300, min_cluster_radius=None, **kwargs): """Differentially private KMeans Initialise the differentially-private Sklearn.cluster.KMeans overriding lloyd algorithm, by adding Gaussian noise. Parameters --------- n_dim : int The dimension size of the input space eps : float The privacy loss (epsilon) per iteration. Currently only fix epsilon is implemented so the overall privacy loss <= eps * max_iter max_cluster_l2 : float The maximum l2 norm of any example vector that we want to cluster max_sample_weight : float The maximum weight of a sample default=1.0 max_iter : int, default=300 Maximum number of iterations of the k-means algorithm for a single run. cluster_to_weight_ratio : float, default=-1 The ratio max_cluster_l2 / max_sample_weight used to scale the cluster counts before adding the noise If it is set to -1, do not scale the counts n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare convergence. verbose : int, default=0 Verbosity mode. 
delta : float, default=1e-7 Gaussian mechanism delta or probability of failure, should be set < 1/num of examples max_failed_cases : int, default=300 The number of sampling trails in sphere packing before halving the minimum cluster radius min_cluster_radius : float, default=None (= max_cluster_l2 / n_clusters) Half the minimum distance between clusters centers """ if min_cluster_radius is None: min_cluster_radius = max_cluster_l2 / n_clusters # Initalise the cluster centers using sphere packing init_centers, min_cluster_radius = sphere_packing_initialization(n_clusters, n_dim, min_cluster_radius, max_cluster_l2, max_failed_cases, verbose) final_eps = [0] # To keep track of the actual number of iterations until convergence def modified_lloyd(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads, update_centers=True): # Clip the maximum client contribution to the cluster count sample_weight = np.minimum(sample_weight, max_sample_weight) if not update_centers: return kmeans_single(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) # Scale input vectors if necessary if np.max(x_squared_norms) > max_cluster_l2 ** 2: if verbose: print(f'Scaling the input examples as their l2 norm is larger than {max_cluster_l2}') scaler_squared = np.minimum(max_cluster_l2 ** 2 / x_squared_norms, 1.0) x_squared_norms[:] = x_squared_norms * scaler_squared X[:] = X * np.sqrt(scaler_squared).reshape(-1, 1) kmeans_single(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads) # Add noise to centers_new add_gaussian_noise(centers_new, weight_in_clusters, eps, max_cluster_l2, max_sample_weight, cluster_to_weight_ratio, delta=delta, verbose=verbose) # Other values need to be changed because of that: center_shift, labels, center_shift[:] = np.linalg.norm(centers - centers_new, axis=-1) # Run E-step of kmeans 
to get the new labels kmeans_single(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) # Increment the number of iterations final_eps[0] += eps sys.modules[KMeans.__module__].lloyd_iter_chunked_dense = modified_lloyd kmeans = KMeans(n_clusters=n_clusters, algorithm='full', init=init_centers, verbose=verbose, max_iter=max_iter, tol=tol, **kwargs) kmeans.eps = final_eps return kmeans def resetKMeans(): sys.modules[KMeans.__module__].lloyd_iter_chunked_dense = kmeans_single ================================================ FILE: extensions/privacy/metrics.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging import numpy as np import torch as T from copy import deepcopy from utils import make_optimizer, print_rank def extract_indices_from_embeddings(gradients, batch, embed_size, vocab_size): # Extract the Input gradient embeddings batch = T.cat([b.view(-1) for b in batch]).cpu().detach().numpy() embed_grad = gradients[:embed_size * vocab_size].reshape(vocab_size, embed_size) valid_batch = batch[batch > 0] tot_valid_tokens, tot_tokens = len(valid_batch), len(batch) # The embedding gradients of the indices seen in the batch have higher l2 norm, # because dl/dembed_i = dl/dembed_input_i * (if word_i is in batch) + dl/dembed_output_i extracted_indices = T.argsort(embed_grad.norm(dim=-1), descending=True)[:tot_tokens].cpu().detach().numpy() # Get the overlap ratio extracted_ratio = np.isin(valid_batch, extracted_indices).mean() # Find True positive extracted indices return extracted_ratio, np.intersect1d(extracted_indices, valid_batch) def compute_perplexity(encoded_batch, model): outputs = model.inference(encoded_batch) (batch_size, seq_len, vocab_size) = outputs['output'].shape perplex = T.nn.functional.log_softmax(outputs['output'], dim=-1) return perplex.reshape(-1, vocab_size)[np.arange(batch_size * 
seq_len), encoded_batch.reshape(-1)].reshape(batch_size, seq_len) def practical_epsilon_leakage(original_params, model, encoded_batches, is_weighted_leakage=True, max_ratio=1e9, optimizer_config=None): # Copy the gradients and save the model. current_params = deepcopy(model.state_dict()) current_gradients = dict((n,p.grad.clone().detach()) for n,p in model.named_parameters()) model.load_state_dict(original_params) pre_perplex, post_perplex = [], [] # This is just to initialise the gradients model.loss(encoded_batches[0][:1]).backward() model.zero_grad() tolerance = 1 / max_ratio max_leakage = 0 with T.no_grad(): # Original model before training on client for encoded_batch in encoded_batches: pre_perplex.append(compute_perplexity(encoded_batch, model)) # The attacker doesn't not he optimal gradient magnitude but using Adamax with high lr, is proved to be effective for n, p in model.named_parameters(): p.grad = current_gradients[n] #.grad print_rank('grad l2: {}'.format(p.grad), loglevel=logging.DEBUG) if optimizer_config is None: optimizer_config = {'lr': 0.03, 'amsgrad': False, 'type': 'adamax'} #T.optim.Adamax(model.parameters(), lr=optim_lr).step() make_optimizer(optimizer_config, model).step() #model.zero_grad() # The model after training on the client data for encoded_batch in encoded_batches: post_perplex.append(compute_perplexity(encoded_batch, model)) for pre, post in zip(pre_perplex, post_perplex): # Compute the ratio of preplexity and weight it be the probability of correctly predicting the word leakage = ((pre + tolerance) / (post + tolerance)).clamp_(0, max_ratio) print_rank('perplexities leakage: {} '.format(leakage), loglevel=logging.DEBUG) if is_weighted_leakage: weight_leakage = T.max(pre.exp(), post.exp()) * leakage else: weight_leakage = leakage max_leakage = max(max_leakage, weight_leakage.max().item()) print_rank('raw max leakage: {}'.format(max_leakage), loglevel=logging.DEBUG) model.load_state_dict(current_params) for n,p in 
model.named_parameters(): p.grad = current_gradients[n] # WE return the log to match epsilon return max(np.log(max_leakage), 0) ================================================ FILE: extensions/quantization/quant.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging import torch from utils import print_rank from typing import Optional, Tuple def quant_model( model: torch.nn.Module, quant_bits: int = 8, quant_threshold: Optional[int] = None, global_stats: bool = False ): '''Quantize the gradients using the desired number of bits. Nothing is returned as gradients inside :code:`model` are modified in-place. Args: model: model which gradients we want to quantize. quant_bits: how many bits will we use to quantize the gradients. quant_threshold: fraction of components to be set to zero; defaults to None, in which case no quantization happens. global_stats: use a single histogram for all layers when binning, defaults to False. ''' # If no `quant_threshold`, does nothing if quant_threshold is None: return print_rank('Performing Gradient Quantization with Prob. 
Threshold: {}'.format( quant_threshold), loglevel=logging.INFO) # If `global_stats` is true, min/max and thresh are computed across all layers if global_stats: flattened_grad = torch.cat([p.grad.data.flatten() for p in model.parameters()]) min_grad, max_grad, thresh = find_min_max_gradient(flattened_grad, quant_threshold) # Loop through all layers for p in model.parameters(): if not global_stats: min_grad, max_grad, thresh = find_min_max_gradient(p.grad.data, quant_threshold) # Perform binning and sparsification of components binned_grad = quant_bins(p.grad.data, 2 ** quant_bits, min_grad, max_grad) p.grad = torch.where(torch.abs(p.grad.data) > thresh, binned_grad, torch.tensor(0.).to(p.grad)) def find_min_max_gradient( gradient: torch.Tensor, quant_threshold: Optional[float] = None ) -> Tuple[float, float, float]: '''Get min and max gradients, as well as threshold gradient. Args: gradient: tensor over which statistics will be computed. quant_threshold: which quantile to look for to compute threshold, must be between 0 and 1. ''' # Computes min/max and quantile corresponding to `quant_threshold` min_grad, max_grad = gradient.min(), gradient.max() thresh = torch.quantile(torch.abs(gradient), quant_threshold) print_rank('Min. and Max. Gradients: {}, {}'.format(min_grad, max_grad), loglevel=logging.INFO) print_rank('Grad. Threshold: {}'.format(thresh), loglevel=logging.INFO) return min_grad, max_grad, thresh def quant_bins( gradients: torch.Tensor, n_bins: int, min_grad: float, max_grad: float ) -> torch.Tensor: '''Perform quantization using binning. Creates histogram with `n_bins` bins between `min_grad` and `max_grad`. Returns a tensor similar to gradients but with components corresponding to bin labels. Args: gradients: tensor we want to quantize. n_bins: how many bins to use for binning. min_grad: min. value for bins. max_grad: max. value for bins. 
''' # We remove half bin width, as bucketize always takes the ceil instead of rounding bin_labels = torch.linspace(min_grad, max_grad, n_bins).to(gradients) bin_width = bin_labels[1] - bin_labels[0] grad_bins = torch.bucketize(gradients - .5 * bin_width, bin_labels, right=False) return bin_labels[grad_bins] ================================================ FILE: requirements.txt ================================================ torch==1.11.0 mpi4py easydict scipy psutil transformers torchvision pandas h5py sphinx_rtd_theme azureml-core azureml-defaults pyyaml scikit-learn cerberus protobuf sentencepiece googledrivedownloader wget ================================================ FILE: testing/README.md ================================================ ## Information The tests are designed to evaluate the operation of the tasks, not the performance. Therefore, we are using dummy data to run all tasks. In order to have ralistic results about the behaviour of each experiment, please follow the instructions provided in the README.md file inside each experiment folder, for downloading the recommended datasets. ## Setup Instructions for Pytest 1. Run create_data.py in order to download and preprocess the dummy training and testing datasets that will be used. Make sure to indicate the task name. The example below shows how to create the data for the ```nlg_gru``` task. ``` python python create_data.py --task nlg_gru ``` 2. The script ```test_e2e_trainer.py``` is designed to run the test over all tasks, therefore you need to run Step 1 for each experiment first). 3. Run ```pytest -v -s``` to perfor the local test. 
================================================ FILE: testing/build_vocab.py ================================================ """Builds vocabulary file from data.""" import argparse import collections import json import os def build_counter(train_data, initial_counter=None): train_tokens = [] for u in train_data: for c in train_data[u]['x']: train_tokens.extend([s for s in c]) all_tokens = [] for i in train_tokens: all_tokens.extend(i) train_tokens = [] if initial_counter is None: counter = collections.Counter() else: counter = initial_counter counter.update(all_tokens) all_tokens = [] return counter def build_vocab(counter, vocab_size=10000): pad_symbol, unk_symbol = 0, 1 count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) count_pairs = count_pairs[:(vocab_size - 2)] # -2 to account for the unknown and pad symbols words, _ = list(zip(*count_pairs)) vocab = {} vocab[''] = pad_symbol vocab[''] = unk_symbol for i, w in enumerate(words): if w != '': vocab[w] = i + 1 return {'vocab': vocab, 'size': vocab_size, 'unk_symbol': unk_symbol, 'pad_symbol': pad_symbol} def load_leaf_data(file_path): with open(file_path) as json_file: data = json.load(json_file) to_ret = data['user_data'] data = None return to_ret def save_vocab(vocab, target_dir): os.makedirs(target_dir, exist_ok=True) with open('./models/vocab_reddit.vocab', 'w') as outV: outV.write('\n') for t in vocab['vocab'].keys(): outV.write(t+'\n') def main(): args = parse_args() json_files = [f for f in os.listdir(args.data_dir) if f.endswith('.json')] json_files.sort() counter = None train_data = {} for f in json_files: print('loading {}'.format(f)) train_data = load_leaf_data(os.path.join(args.data_dir, f)) print('counting {}'.format(f)) counter = build_counter(train_data, initial_counter=counter) print() train_data = {} if counter is not None: vocab = build_vocab(counter, vocab_size=args.vocab_size) save_vocab(vocab, args.target_dir) else: print('No files to process.') def parse_args(): parser = 
argparse.ArgumentParser() parser.add_argument('--data-dir', help='dir with training file;', type=str, required=True) parser.add_argument('--vocab-size', help='size of the vocabulary;', type=int, default=10000, required=False) parser.add_argument('--target-dir', help='dir with training file;', type=str, default='./', required=False) return parser.parse_args() if __name__ == '__main__': main() ================================================ FILE: testing/create_data.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import csv import json import random import argparse import platform from collections import OrderedDict from itertools import islice import tqdm import h5py import torchvision import torchvision.transforms as transforms from google_drive_downloader import GoogleDriveDownloader as gdd def get_arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument("--task") return parser def reduce_users(file): with open(file, 'r') as f: json_file = json.load(f) num_samples = json_file['num_samples'][0:25] user_data = dict(OrderedDict(islice(json_file['user_data'].items(), 0, 25))) users_list = list(user_data.keys()) return users_list, num_samples, user_data def _process_and_save_to_disk(dataset, n_users, exp, output): '''Process a Torchvision dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : n_users * [samples_per_user], 'user_data' : {f'{user_id:04d}': dataset.data[user_idxs(user_id)].tolist() if exp =="classif_cnn" else 
dataset.data[user_idxs(user_id)] for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } with h5py.File(output + '.hdf5', 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in tqdm.tqdm(data_dict['user_data'].items()): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in tqdm.tqdm(data_dict['user_data_label'].items()): user_data_label_group.create_dataset(user, data=user_data_label) class HeartDataSet: def __init__(self, heartdata, cutoff): self.data = [row[:187] for row in heartdata][:cutoff] self.targets = [int(float(row[187])) for row in heartdata][:(round(len(heartdata), -3))][:cutoff] def __len__(self): return len(self.data) def main(): parser = get_arg_parser() args = parser.parse_args() args = vars(args) exp = args["task"] # Create data folder os.system("mkdir data") if exp == "nlg_gru" or exp == "mlm_bert": # Download preprocessed reddit dataset by LEAF: A Benchmark for Federated Settings gdd.download_file_from_google_drive(file_id='1ISzp69JmaIJqBpQCX-JJ8-kVyUns8M7o', dest_path='./data/nlg_gru.zip', unzip=True) files = ["train_data", "val_data", "test_data"] for file in files: orig_file = os.path.join("data","new_small_data",str(file+".json")) users_list, num_samples, user_data = reduce_users(orig_file) # Preprocess data if exp == "nlg_gru": os.makedirs("data/nlg_gru", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/nlg_gru") for 
users in user_data: listToStr = '' for i, sentences in enumerate(user_data[users]['x']): for j, pieces in enumerate(sentences): listToStr = ' '.join([elem for elem in pieces]) user_data[users]['x'][i][j] = listToStr full_sentence = ' '.join([elem for elem in sentences]) full_sentence = full_sentence.replace('', '').replace('', '').replace('', '').strip() user_data[users]['x'][i] = full_sentence user_data[users].pop('y',None) elif exp == "mlm_bert": os.makedirs("data/mlm_bert", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/mlm_bert") user_data_aux = dict() for users in user_data: listToStr = '' for i, sentences in enumerate(user_data[users]['x']): for j, pieces in enumerate(sentences): listToStr = ' '.join([elem for elem in pieces]) listToStr = listToStr.replace('', '').replace('', '').replace('', '').strip() user_data[users]['x'][i][j] = listToStr user_data[users].pop('y',None) user_data_aux[users] = user_data[users]['x'] user_data = user_data_aux # Create new dictionary new_dict = {'users':users_list ,'num_samples':num_samples, 'user_data':user_data} # Save preprocessed files ext = ".json" if exp=="nlg_gru" else ".txt" new_file = os.path.join("data",exp,str(file+ ext)) f = open(new_file,'w') json.dump(new_dict,f) f.close() # Build vocabulary os.system(str("python build_vocab.py --data-dir ./data/"+ exp +" --target-dir ./models")) elif exp == "classif_cnn": os.makedirs("data/classif_cnn", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/classif_cnn") # Get training and testing sets from torchvision transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True,download=True, transform=transform) testset = torchvision.datasets.CIFAR10(root='./data', train=False,download=True, transform=transform) # Saving datasets _process_and_save_to_disk(trainset, n_users=50, exp=exp, 
output='./data/classif_cnn/train_data') _process_and_save_to_disk(testset, n_users=50, exp=exp, output='./data/classif_cnn/test_data') elif exp == "ecg_cnn": os.makedirs("data/ecg_cnn", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/ecg_cnn") # Create dummy datasets for set in ['train_data.csv', 'test_data.csv']: data= [random.random() for i in range(188)] with open(os.path.join('data',exp,set), 'w', newline='') as f: write = csv.writer(f) for row in range(87554): write.writerow(data) # Preprocess datasets for set in ['train_data', 'test_data']: with open(os.path.join('data',exp,str(set+".csv"))) as f: testset = list(csv.reader(f , delimiter=',')) TestDataset = HeartDataSet(testset, 21000) _process_and_save_to_disk(TestDataset,1000,exp,os.path.join('data',exp,set)) if __name__ == '__main__': main() ================================================ FILE: testing/hello_world_classif_cnn.yaml ================================================ # Basic configuration file for running classif_cnn example using hdf5 files. # Parameters needed to initialize the model model_config: model_type: CNN # class w/ `loss` and `inference` methods model_folder: experiments/classif_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1 # how many iterations between metric eval on val set rec_freq: 5 # how many iterations between metric eval on test set initial_val: true initial_rec: true max_iteration: 3 # how many iterations in total num_clients_per_iteration: 3 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: null test: batch_size: 10000 test_data: null type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: f1_score fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 4 list_of_train_data: null desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.9 type: optimization ================================================ FILE: testing/hello_world_ecg_cnn.yaml ================================================ # Basic configuration file for running ecg_cnn example using json files. 
# Parameters needed to initialize the model model_config: model_type: SuperNet # class w/ `loss` and `inference` methods model_folder: experiments/ecg_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 5 # how many iterations between metric eval on val set rec_freq: 3 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 3 # how many iterations in total num_clients_per_iteration: 3 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: data/ecg_cnn/test_data.hdf5 test: batch_size: 10000 test_data: data/ecg_cnn/test_data.hdf5 type: model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 96 list_of_train_data: data/ecg_cnn/train_data.hdf5 desired_max_samples: 87000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.90 type: optimization ================================================ FILE: testing/hello_world_mlm_bert.yaml ================================================ # Basic configuration file for running mlm_bert example using json files. # Parameters needed to initialize the model model_config: model_type: BERT model_folder: experiments/mlm_bert/model.py BERT: loader_type: text model: model_name: roberta-large cache_dir: ./cache_dir use_fast_tokenizer: False mask_token: task: mlm past_index: -1 prediction_loss_only: false process_line_by_line: false training: seed: 12345 label_smoothing_factor: 0 batch_size: 64 max_seq_length: 256 # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of parameters is needed. enable_global_dp: false # Local dp clips and adds noise on the client and centrally accumulates the privacy budget eps: 100 # epsilon global_sigma: 0.35 # Used when global dp es enabled, specifies the global Gaussian noise weight_scaler: 0.0001 # indicates how the aggregation weights scaled before noise addition, and unscaled afterwards. max_grad: 0.008 # max gradient max_weight: 0.5 # The max_weight and min_weight should be already scaled by weight_scaler min_weight: 0.0000001 # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. # Select the Federated optimizer to use (e.g. 
DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. wantRL: false # Enable/Disable Reinforcement learning optimizer_config: # Configuration for server-side optimizer lr: 0.00001 weight_decay: 0.01 type: adamW annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 1000 val_freq: 5 # Frequency for validation rounds rec_freq: 5 # Frequency for testing rounds initial_val : false # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 2 # Total number of rounds for FL num_clients_per_iteration: 2 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data val_data: data/mlm_bert/val_data.txt task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 min_words_per_utt: 5 max_samples_per_user: 5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir # Note this is NOT the main training data configuration, which is configured in the # client config. This section is ignored unless you are running replay data. # If you want to run replay data- set a path name for train_data_server. 
# train: # loader_type: text # train_data: null # train_data_server: null # desired_max_samples: null test: # Test data configuration test_data: data/mlm_bert/test_data.txt task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 max_samples_per_user: 5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir type: model_optimization # Server type aggregate_median: softmax # FL aggregation method weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 1.00 initial_lr_client: 0.00001 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration list_of_train_data: data/mlm_bert/train_data.txt task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 24 max_seq_length: 256 min_words_per_utt: 5 desired_max_samples: 5000 mask_token: num_workers: 0 num_frames: 0 max_grad_norm: 15.0 prepend_datapath: false cache_dir: ./cache_dir pin_memory: true type: optimization meta_optimizer_config: lr: 0.01 type: adam optimizer_config: type: adamW weight_decay: 0.01 amsgrad: true annealing_config: type: step_lr step_interval: epoch step_size: 2 gamma: 1.0 ================================================ FILE: testing/hello_world_nlg_gru.yaml ================================================ # Basic configuration file for running nlg_gru example using json files. 
# Parameters needed to initialize the model model_config: model_type: GRU model_folder: experiments/nlg_gru/model.py embed_dim: 160 vocab_size: 10000 hidden_dim: 512 OOV_correct: false # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of parameters is needed. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. # Select the Federated optimizer to use (e.g. DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # Enable/Disable Reinforcement learning resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. optimizer_config: # Configuration for server-side optimizer type: adam lr: 0.003 amsgrad: true annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1 # Frequency for validation rounds rec_freq: 5 # Frequency for testing rounds initial_val : true # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 3 # Total number of rounds for FL num_clients_per_iteration: 10 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data # batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false val_data: data/nlg_gru/val_data.json vocab_dict: models/vocab_reddit.vocab pin_memory: true num_workers: 0 # Indicates how many workers are used for creating batches num_frames: 2400 max_batch_size: 2048 max_num_words: 25 unsorted_batch: true test: # Test data configuration batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false train_data: null train_data_server: null test_data: data/nlg_gru/test_data.json vocab_dict: models/vocab_reddit.vocab pin_memory: true 
num_workers: 0 # Indicates how many workers are used for creating batches max_batch_size: 2048 max_num_words: 25 unsorted_batch: true type: model_optimization aggregate_median: softmax # FL aggregation method weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 20.0 initial_lr_client: 1.0 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false num_skips_threshold: 10 copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration batch_size: 64 tokenizer_type: not_applicable prepend_datapath: false list_of_train_data: data/nlg_gru/train_data.json vocab_dict: models/vocab_reddit.vocab pin_memory: true num_workers: 0 desired_max_samples: 50000 max_grad_norm: 20.0 max_batch_size: 128 max_num_words: 25 unsorted_batch: true type: optimization meta_optimizer_config: lr: 1.0 type: sgd optimizer_config: type: sgd annealing_config: type: step_lr step_interval: epoch step_size: 1 gamma: 1.0 ================================================ FILE: testing/test_e2e_trainer.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import subprocess import os import platform import pytest xfail = pytest.mark.xfail def get_info(task): data_path=r'./testing/' output_path=r'./testing/outputs/' if task == 'nlg_gru': config_path=r'./testing/hello_world_nlg_gru.yaml' elif task == "classif_cnn": config_path=r'./testing/hello_world_classif_cnn.yaml' elif task == "ecg_cnn": config_path=r'./testing/hello_world_ecg_cnn.yaml' elif task == "mlm_bert": config_path=r'./testing/hello_world_mlm_bert.yaml' return data_path, output_path, config_path def run_pipeline(data_path, output_path, config_path, task): print("Testing {} task".format(task)) # Adjust command to the task and OS sym = "&" if platform.system() == "Windows" else ";" command = 'cd .. '+ sym +' python '+'-m '+'torch.distributed.run '+ '--nproc_per_node=2 '+ 'e2e_trainer.py '+ \ '-dataPath '+ data_path+' -outputPath '+output_path+' -config ' +config_path +\ ' -task '+ task + ' -backend '+ 'nccl' # Execute e2e_trainer + stores the exit code with open('logs.txt','w') as f: process= subprocess.run(command, shell=True,stdout=f,text=True,timeout=900) return_code=process.returncode # Print logs os.system("ls") os.system("less logs.txt") print(process.stderr) print("Finished running {} task".format(task)) return return_code def test_nlg_gru(): task = 'nlg_gru' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 def test_ecg_cnn(): task = 'ecg_cnn' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 @pytest.mark.xfail def test_mlm_bert(): task = 'mlm_bert' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 print("PASSED") @pytest.mark.xfail def test_classif_cnn(): task = 'classif_cnn' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 print("PASSED") 
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import random
import logging
from importlib.machinery import SourceFileLoader

from torch.utils.data import sampler

# NOTE(review): the project-local helpers ``utils.AverageMeter`` and
# ``utils.print_rank`` are imported lazily at their use sites so this module
# can be imported standalone and does not create a utils <-> data_utils cycle.


class BatchSampler(sampler.Sampler):
    """Batch sampler yielding contiguous ranges of dataset indices.

    We want to sample batches randomly, but each batch should have samples
    that are close to each other in the dataset (so that we don't have a lot
    of zero padding).

    Args:
        dataset: sized dataset the indices refer to.
        batch_size (int): nominal number of samples per batch.
        randomize (bool): shuffle the batch order on every iteration.
        drop_last (bool): drop, instead of trimming, an incomplete last batch.
    """

    def __init__(self, dataset, batch_size, randomize=True, drop_last=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.randomize = randomize
        batches = [range(begin_id, begin_id + batch_size)
                   for begin_id in range(0, len(dataset), batch_size)]
        # Valid indices are 0 .. len(dataset)-1, so the last batch must be
        # trimmed (or dropped) whenever its final index reaches len(dataset).
        # BUG FIX: the original test used `>`, letting a batch whose last index
        # was exactly len(dataset) through and indexing out of range; it also
        # raised IndexError on an empty dataset (batches == []).
        if batches and batches[-1][-1] >= len(dataset):
            if drop_last:
                del batches[-1]
            else:
                batches[-1] = range(batches[-1][0], len(dataset))
        self.batches = batches

    def __iter__(self):
        if self.randomize:
            random.shuffle(self.batches)
        return iter(self.batches)

    def __len__(self):
        # NOTE(review): returns number of *samples* (batches * batch_size),
        # not the number of batches; preserved because callers may depend on it.
        return len(self.batches) * self.batch_size


class DynamicBatchSampler(sampler.Sampler):
    """Extension of Sampler that will do the following:
        1.  Change the batch size (essentially number of sequences)
            in a batch to ensure that the total number of frames are less
            than a certain threshold.
        2.  Make sure the padding efficiency in the batch is high.
    """

    def __init__(self, sampler, frames_threshold, max_batch_size=0, unsorted_batch=False, fps=1000 / 30):
        """
        @sampler: will mostly be an instance of DistributedSampler.
        Though it should work with any sampler.
        @frames_threshold: maximum area of the batch
        """
        from utils import AverageMeter  # lazy: project-local helper

        self.sampler = sampler
        self.frames_threshold = frames_threshold
        self.max_batch_size = max_batch_size
        self.unsorted_batch = unsorted_batch

        indices, batches = list(), list()
        # the dataset to which these indices are pointing to
        dataset = self.sampler.dataset

        # get all the indices and corresponding durations from the sampler
        for idx in self.sampler:
            indices.append((idx, dataset.utt_list[idx]["duration"]))

        # sort the indices according to duration
        if self.unsorted_batch is False:
            indices.sort(key=lambda elem: elem[1])
            max_dur = indices[-1][1]
        else:
            # make sure that you will be able to serve all the utterances
            max_dur = max([indices[i][1] for i in range(len(indices))])

        # start clubbing the utterances together
        batch = list()
        batch_frames, batch_area = 0, 0
        max_frames_in_batch = 0
        average_meter = AverageMeter('Padding Efficiency')
        for idx, duration in indices:
            if duration > 0:
                frames = duration * fps
                if frames > max_frames_in_batch:
                    max_frames_in_batch = frames

                if (self.unsorted_batch and len(batch) < max_batch_size) \
                        or (not self.unsorted_batch
                            and batch_frames + frames <= self.frames_threshold
                            and (max_batch_size == 0 or len(batch) < max_batch_size)):
                    batch.append(idx)
                    batch_frames += frames
                    batch_area = max_frames_in_batch * len(batch)
                else:
                    # log the stats and add previous batch to batches
                    if batch_area > 0 and len(batch) > 0:
                        average_meter.add(batch_frames, batch_area)
                        batches.append(batch)
                    # make a new one
                    # NOTE(review): the current ``idx`` is not appended to the
                    # fresh batch even though its frame counters are seeded with
                    # it -- the utterance appears to be dropped. Preserved as-is;
                    # confirm against upstream before changing.
                    batch = list()
                    batch_frames, batch_area = frames, frames
                    max_frames_in_batch = batch_frames

        # When all indices are processed
        if batch_area > 0 and len(batch) > 0:
            average_meter.add(batch_frames, batch_area)
            batches.append(batch)

        # don't need the 'indices' any more
        del indices
        self.batches = batches
        average_meter.display_results(loglevel=logging.DEBUG)

    def __iter__(self):
        # shuffle on a batch level
        random.shuffle(self.batches)
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)


def get_exp_dataloader(task):
    """Return the DataLoader class declared in the experiment folder.

    Args:
        task (str): task parsed from the console.

    Raises:
        Exception: re-raised when ``experiments/<task>/dataloaders/dataloader.py``
            cannot be loaded. The original swallowed the error with a bare
            ``except`` and then crashed with ``UnboundLocalError`` on the return.
    """
    module_path = os.path.join('experiments', task, 'dataloaders', 'dataloader.py')
    try:
        module = SourceFileLoader("DataLoader", module_path).load_module()
        return module.DataLoader
    except Exception:
        from utils import print_rank  # lazy: project-local helper
        print_rank("Dataloader not found, please make sure is located inside the experiment folder")
        raise


def make_train_dataloader(data_config, data_path, clientx, task=None, vec_size=300, data_strct=None, replay_server=False):
    """Create a dataloader for training on either server or client side."""
    mode = 'train'
    tokenizer_type = data_config.get('tokenizer_type', 'not_applicable')

    # Training list for a server
    if clientx is None:
        if "train_data_server" not in data_config or data_config["train_data_server"] is None:
            from utils import print_rank  # lazy: project-local helper
            print_rank("No server training set is defined")
            return None
        my_data = os.path.join(data_path, data_config["train_data_server"])
        mode = 'val'  # Only for replay_server
        clientx = 0   # Only for replay_server
    # Training list on a client side
    else:
        if tokenizer_type != 'not_applicable':
            assert clientx >= 0 and clientx < len(data_config["train_data"]), "Invalid client index {}".format(clientx)
            my_data = data_config["train_data"][clientx]
        else:
            my_data = data_config["list_of_train_data"]

    DataLoader = get_exp_dataloader(task)
    train_dataloader = DataLoader(data=data_strct if data_strct is not None else my_data,
                                  user_idx=clientx,
                                  mode=mode,
                                  args=data_config)
    return train_dataloader


def make_val_dataloader(data_config, data_path, task=None, data_strct=None, train_mode=False):
    """Return a data loader for a validation set."""
    DataLoader = get_exp_dataloader(task)
    val_file = os.path.join(data_path, data_config["val_data"]) \
        if data_config["val_data"] is not None and data_path is not None else None
    val_dataloader = DataLoader(data=data_strct if data_strct is not None else val_file,
                                user_idx=0,
                                mode='val',
                                args=data_config)
    return val_dataloader


def make_test_dataloader(data_config, data_path, task=None, data_strct=None):
    """Return a data loader for an evaluation set."""
    DataLoader = get_exp_dataloader(task)
    test_file = os.path.join(data_path, data_config["test_data"]) \
        if data_config["test_data"] is not None and data_path is not None else None
    test_dataloader = DataLoader(data=data_strct if data_strct is not None else test_file,
                                 user_idx=0,
                                 mode='test',
                                 args=data_config)
    return test_dataloader


def get_dataset(data_path, config, task, mode, test_only=False, user_idx=-1, data_strct=None):
    """Return the task train/val/test dataset."""
    # Load Dataset Class declared in the experiment folder
    data_config = get_data_config(config, mode)
    dataset_path = os.path.join('experiments', task, 'dataloaders', 'dataset.py')
    loader = SourceFileLoader("Dataset", dataset_path).load_module()
    dataset = loader.Dataset

    data_file = "val_data" if mode == "val" else "test_data" if mode == "test" else "list_of_train_data"
    data_file = data_config[data_file]
    data_pointer = os.path.join(data_path, data_file) if data_file is not None else data_file

    return dataset(data_pointer if data_strct is None else data_strct,
                   test_only=test_only, user_idx=user_idx, args=data_config)


def get_data_config(config, mode):
    """Return the configuration for the dataset.

    ``val``/``test`` come from the server config; everything else falls back to
    the client-side ``train`` config, optionally merged with the client's
    ``semisupervision`` section (semisupervision keys win on conflict).
    """
    if mode == 'val':
        data_config = config['server_config']['data_config']["val"]
    elif mode == 'test':
        data_config = config['server_config']['data_config']["test"]
    else:
        data_config = config["client_config"]["data_config"]["train"]
        semisupervision_config = config["client_config"].get('semisupervision', None)
        if semisupervision_config is not None:
            return {**data_config, **semisupervision_config}
    return data_config
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math

import torch
from torch.optim import Optimizer


class AdamW(Optimizer):
    """ Implements Adam algorithm with weight decay fix.

    Weight decay is decoupled from the gradient-based update (applied directly
    to the weights after the Adam step), as opposed to L2 regularization.

    Parameters:
        lr (float): learning rate. Default 1e-3.
        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
        eps (float): Adams epsilon. Default: 1e-6
        weight_decay (float): Weight decay. Default: 0.0
        correct_bias (bool): can be set to False to avoid correcting bias in Adam
            (e.g. like in Bert TF repository). Default True.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
        super(AdamW, self).__init__(
            params,
            dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias),
        )

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = closure() if closure is not None else None

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            for param in group['params']:
                if param.grad is None:
                    continue
                grad = param.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[param]
                if not state:
                    # Lazily initialize step counter and first/second moments.
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(param.data)
                    state['exp_avg_sq'] = torch.zeros_like(param.data)

                first_moment = state['exp_avg']
                second_moment = state['exp_avg_sq']
                state['step'] += 1

                # Update biased running averages of the gradient and its square
                # (in place, so the state tensors are reused).
                first_moment.mul_(beta1).add_(grad, alpha=1.0 - beta1)
                second_moment.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)

                denom = second_moment.sqrt().add_(group['eps'])
                step_size = group['lr']
                if group['correct_bias']:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1 ** state['step']
                    bias_correction2 = 1.0 - beta2 ** state['step']
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

                param.data.addcdiv_(first_moment, denom, value=-step_size)

                # Decoupled weight decay: decaying the weights directly, rather
                # than adding the squared weights to the loss, keeps the decay
                # from interacting with the m/v statistics (equivalent to plain
                # SGD weight decay).
                if group['weight_decay'] > 0.0:
                    param.data.add_(param.data, alpha=-group['lr'] * group['weight_decay'])

        return loss
# Licensed under the MIT license. """Lamb optimizer.""" import collections import math import torch from torch.optim import Optimizer try: from tensorboardX import SummaryWriter def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): """Log a histogram of trust ratio scalars in across layers.""" results = collections.defaultdict(list) for group in optimizer.param_groups: for p in group['params']: state = optimizer.state[p] for i in ('weight_norm', 'adam_norm', 'trust_ratio'): if i in state: results[i].append(state[i]) for k, v in results.items(): event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) except ImportError: def log_lamb_rs(optimizer, event_writer, token_count): print("tensorboardX is not installed") class LAMB(Optimizer): r"""Implements Lamb algorithm. It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) adam (bool, optional): always use trust ratio = 1, which turns this into Adam. Useful for comparison purposes. .. 
_Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: https://arxiv.org/abs/1904.00962 """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0, adam=False): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) self.adam = adam super(LAMB, self).__init__(params, defaults) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group['params']: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') state = self.state[p] # State initialization if len(state) == 0: state['step'] = 0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] state['step'] += 1 # Decay the first and second moment running average coefficient # m_t exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # v_t exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) # Paper v3 does not use debiasing. # bias_correction1 = 1 - beta1 ** state['step'] # bias_correction2 = 1 - beta2 ** state['step'] # Apply bias to lr to avoid broadcast. 
step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) if group['weight_decay'] != 0: adam_step.add_(p.data, alpha=group['weight_decay']) adam_norm = adam_step.pow(2).sum().sqrt() if weight_norm == 0 or adam_norm == 0: trust_ratio = 1 else: trust_ratio = weight_norm / adam_norm state['weight_norm'] = weight_norm state['adam_norm'] = adam_norm state['trust_ratio'] = trust_ratio if self.adam: trust_ratio = 1 p.data.add_(adam_step, alpha=-step_size * trust_ratio) return loss ================================================ FILE: utils/optimizers/lars.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. """distoptim.hit package""" import logging import torch LOG = logging.getLogger(__name__) class LarsSGDV1(torch.optim.SGD): """ LARS SGD V1, based on https://arxiv.org/abs/1708.03888 2018. Refer to torch.optim.SGD for paramters. """ def __init__(self, params, lr, momentum=0, dampening=0, weight_decay=0, nesterov=False): LOG.info("Init LarsSGDV1") super(LarsSGDV1, self).__init__( params, lr, momentum, dampening, weight_decay, nesterov) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] # dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data p_n = p.data.norm() d_p_n = d_p.norm() if weight_decay != 0: d_p_n.add_(weight_decay, p_n) d_p.add_(weight_decay, p.data) alpha = 0.001 * p_n / d_p_n # This is the LARS eta from the paper lr = alpha * group['lr'] lr = min(lr, 5.0) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = \ torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(lr, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-1, d_p) return loss class LarsSGD(torch.optim.SGD): """ LARS SGD, based on https://arxiv.org/abs/1904.00962 Algorithm 1 2019, a newer version. Refer to torch.optim.SGD for paramters. """ def __init__(self, params, lr, momentum=0, dampening=0, weight_decay=0, nesterov=False): LOG.info("Init LarsSGD") super(LarsSGD, self).__init__( params, lr, momentum, dampening, weight_decay, nesterov) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] # dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add(p.data, alpha=weight_decay) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = \ torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - momentum, d_p) if nesterov: d_p = d_p.add(buf, alpha=momentum) else: d_p = buf lr = group['lr'] * p.data.norm() / (d_p.norm() + 1e-8) lr.clamp_(0, 10) p.data.add_(d_p, alpha=-lr) return loss ================================================ FILE: utils/preprocessing/create-hdf5.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import h5py import time from tqdm import tqdm import pandas as pd path = r'C:\Users\train.tsv' def local_time(): return str(time.strftime("%H:%M:%S",time.localtime())) print(local_time() + " Starting script " ) columns = ['author','num1','content','str1','str2','num2','subreddit'] df = pd.read_csv(path, sep='\t', names=columns, header=None) print(local_time() + " File has been read " ) df_authors = pd.DataFrame(df['author']) df_content = pd.DataFrame(df['content']) df_file = pd.concat([df_authors,df_content], axis=1) print(local_time() + " Data needed has been concatenated ") users_group = df_file.groupby('author') group0 = df_file.groupby(['author','content']) group1 = pd.Series(users_group.size()) users = (group1.index).to_numpy() print(local_time() + " users been formatted ") num_samples = group1.values print(local_time() + " num_samples has been formatted ") user_data_dict= {} user_data_dict= {i: {'x':list()} for i in tqdm(users)} for i in tqdm(range(len(df_file))): if df_file['content'][i] not in 
# ---------------------------------------------------------------------------
# FILE: utils/preprocessing/create-hdf5.py
# ---------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Converts a reddit-style TSV dump into the HDF5 layout consumed by FLUTE:
# top-level 'users' and 'num_samples' datasets plus a 'user_data' group with
# one deduplicated 'x' dataset per user.

import h5py
import time
from tqdm import tqdm
import pandas as pd

path = r'C:\Users\train.tsv'

def local_time():
    """Wall-clock time as HH:MM:SS, for lightweight progress logging."""
    return str(time.strftime("%H:%M:%S", time.localtime()))

print(local_time() + " Starting script ")
columns = ['author', 'num1', 'content', 'str1', 'str2', 'num2', 'subreddit']
df = pd.read_csv(path, sep='\t', names=columns, header=None)
print(local_time() + " File has been read ")

df_file = pd.concat([pd.DataFrame(df['author']), pd.DataFrame(df['content'])], axis=1)
print(local_time() + " Data needed has been concatenated ")

users_group = df_file.groupby('author')
group1 = pd.Series(users_group.size())
users = (group1.index).to_numpy()
print(local_time() + " users been formatted ")

num_samples = group1.values
print(local_time() + " num_samples has been formatted ")

# Collect each user's unique utterances, preserving first-seen order.
# PERF FIX: membership was previously tested against a growing list
# (O(n) per row, quadratic overall); a per-user set makes it O(1).
user_data_dict = {i: {'x': list()} for i in tqdm(users)}
seen = {i: set() for i in users}
for i in tqdm(range(len(df_file))):
    author = df_file['author'][i]
    content = df_file['content'][i]
    if content not in seen[author]:
        seen[author].add(content)
        user_data_dict[author]['x'].append(content)
print(local_time() + " user_data has been formatted ")

f = h5py.File(r"C:\Users\train.hdf5", "w")
dset_0 = f.create_dataset("num_samples", data=num_samples)
dset_1 = f.create_dataset("users", data=users)
print(local_time() + " starting to store dictionary ")
user_data = f.create_group("user_data")
for user in tqdm(user_data_dict):
    user_group = user_data.create_group(user)
    # h5py needs byte strings for variable-length text datasets.
    user_data_dict[user]['x'] = [str(e).encode('utf8') for e in user_data_dict[user]['x']]
    x_dset = user_group.create_dataset('x', data=user_data_dict[user]['x'])
print(local_time() + " end of script ")


# ---------------------------------------------------------------------------
# FILE: utils/preprocessing/create-json.py
# ---------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Same conversion as create-hdf5.py, but emits the FLUTE JSON layout
# ({users, num_samples, user_data}) instead of HDF5.

import json
import time
from tqdm import tqdm
import pandas as pd

path = r'C:\Users\train.tsv'

def local_time():
    """Wall-clock time as HH:MM:SS, for lightweight progress logging."""
    return str(time.strftime("%H:%M:%S", time.localtime()))

print(local_time() + " Starting script ")
columns = ['author', 'num1', 'content', 'str1', 'str2', 'num2', 'subreddit']
df = pd.read_csv(path, sep='\t', names=columns, header=None)
print(local_time() + " File has been read ")

df_file = pd.concat([pd.DataFrame(df['author']), pd.DataFrame(df['content'])], axis=1)
print(local_time() + " Data needed has been concatenated ")

users_group = df_file.groupby('author')
group1 = pd.Series(users_group.size())
users = (group1.index).to_numpy()
print(local_time() + " users been formatted ")

num_samples = group1.values
print(local_time() + " num_samples has been formatted ")

# Collect each user's unique utterances, preserving first-seen order
# (set-based dedupe instead of the original quadratic list scan).
user_data_dict = {i: {'x': list()} for i in tqdm(users)}
seen = {i: set() for i in users}
for i in tqdm(range(len(df_file))):
    author = df_file['author'][i]
    content = df_file['content'][i]
    if content not in seen[author]:
        seen[author].add(content)
        user_data_dict[author]['x'].append(content)

f = open(r'C:\Users\train.json', "w")
new_data = {'users': users.tolist(), 'num_samples': num_samples.tolist(), 'user_data': user_data_dict}
json.dump(new_data, f)
print(local_time() + " end of script ")


# ---------------------------------------------------------------------------
# FILE: utils/preprocessing/from_json_to_hdf5.py
# ---------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Re-encodes a FLUTE JSON dataset into the equivalent HDF5 layout.

import json
import h5py
from tqdm import tqdm
import time

# NOTE(review): the path ends in .tsv although the file is parsed as JSON --
# looks like a copy-paste leftover; confirm the intended input path.
json_file = r'C:\Users\train.tsv'

def local_time():
    """Wall-clock time as HH:MM:SS, for lightweight progress logging."""
    return str(time.strftime("%H:%M:%S", time.localtime()))

print(local_time() + " Starting script ")
with open(json_file, 'r') as f:
    json_data = json.load(f)
print(local_time() + " JSON file read ")

hdf_file = h5py.File(r"C:\Users\train.hdf5", "w")
dset_0 = hdf_file.create_dataset("users", data=json_data['users'])
dset_1 = hdf_file.create_dataset("num_samples", data=json_data['num_samples'])
print(local_time() + " users and num_samples stored ")

user_data = hdf_file.create_group("user_data")
for user in tqdm(json_data['user_data']):
    user_group = user_data.create_group(user)
    dset_2 = user_group.create_dataset('x', data=json_data['user_data'][user]['x'])
print(local_time() + " end of script ")
"""Shared utilities for FLUTE: optimizer/LR-scheduler factories, logging helpers,
n-best JSON-list writers for ASR-style multi-task training, gradient flatten /
re-distribute helpers, and personalization / semi-supervision routines."""

import os
import sys
import numpy as np
import logging
import yaml
import time
import math
import json
import copy
import io
import pstats
import functools
import torch
from collections import OrderedDict
from utils.optimizers.lars import LarsSGD
from utils.optimizers.lamb import LAMB
from utils.optimizers.adamW import AdamW
from easydict import EasyDict as edict
from torch.optim.lr_scheduler import (
    StepLR,
    MultiStepLR,
    ReduceLROnPlateau
)


def make_optimizer(optimizer_config, model):
    """Initialization for optimizer.

    Builds the optimizer named by optimizer_config["type"] over model.parameters(),
    passing every other key of the config through as a keyword argument.
    Raises ValueError for an unknown type.
    """
    # Work on a copy so the caller's config dict is not mutated by the pop()s below.
    tmp_config = copy.deepcopy(optimizer_config)
    if optimizer_config["type"] == "sgd":
        tmp_config.pop("type", None)
        return torch.optim.SGD(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "adam":
        tmp_config.pop("type", None)
        return torch.optim.Adam(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "adamax":
        tmp_config.pop("type", None)
        # Adamax does not accept amsgrad; drop it if the config carries one.
        tmp_config.pop("amsgrad", None)
        return torch.optim.Adamax(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "lars":
        tmp_config.pop("type", None)
        # Imported lazily so torchlars is only required when this type is requested.
        from torchlars import LARS
        base_optimizer = torch.optim.SGD(model.parameters(), **tmp_config)
        return LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001)
    elif optimizer_config["type"] == "LarsSGD":
        tmp_config.pop("type", None)
        return LarsSGD(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "lamb":
        tmp_config.pop("type", None)
        return LAMB(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "adamW":
        tmp_config.pop("type", None)
        tmp_config.pop("amsgrad", None)
        return AdamW(model.parameters(), **tmp_config)
    else:
        raise ValueError("{} optimizer not supported".format(optimizer_config["type"]))


def get_lr(optimizer):
    """Obtain LR (of the optimizer's first param group only)."""
    for param_group in optimizer.param_groups:
        return param_group['lr']


def get_lr_all(optimizer):
    """Double checking for get_lr: yield the LR of every param group."""
    for param_group in optimizer.param_groups:
        yield param_group['lr']


def softmax(X, theta=1.0, axis=None):
    """Compute the softmax of each element along an axis of X.

    Args:
        X (ndarray): x, probably should be floats.
        theta (float): used as a multiplier prior to exponentiation. Default = 1.0
        axis : axis to compute values along. Default is the first non-singleton axis.

    Returns:
        An array the same size as X. The result will sum to 1 along the specified axis.
    """
    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)

    # multiply y against the theta parameter,
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.expand_dims(np.max(y, axis=axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis=axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # flatten if X was 1D
    if len(X.shape) == 1:
        p = p.flatten()

    return p


class AverageMeter(object):
    """
    Will calculate running micro and macro averages for various
    (error/efficiency) rates.
    """

    def __init__(self, metric_name):
        # Parallel lists: one (numerator, denominator) pair per add() call.
        self.numerators, self.denominators = list(), list()
        self.metric_name = metric_name

    def add(self, top, bottom):
        """Record one rate observation as numerator/denominator."""
        self.numerators.append(top)
        self.denominators.append(bottom)

    def get_macro_average(self):
        """Average of per-observation rates (each pair weighted equally)."""
        scores = [float(self.numerators[i]) / self.denominators[i] \
                  for i in range(len(self.denominators))]
        return self.get_average(scores)

    def get_micro_average(self):
        """Pooled rate: sum of numerators over sum of denominators."""
        return float(sum(self.numerators)) / sum(self.denominators)

    # accepts a list and returns average
    def get_average(self, l):
        return sum(l) / float(len(l))

    def reset(self):
        """Discard all recorded observations."""
        self.numerators, self.denominators = list(), list()

    def display_results(self, loglevel=logging.INFO):
        """Log both macro and micro averages at the given level."""
        print_rank("{} Macro average: {}".format(self.metric_name, self.get_macro_average()), loglevel)
        print_rank("{} Micro average: {}".format(self.metric_name, self.get_micro_average()), loglevel)


def make_lr_scheduler(annealing_config, optimizer, num_batches=1):
    """Set learning rate scheduler.

    Builds the scheduler named by annealing_config["type"]; when step_interval
    is "epoch", epoch-denominated step sizes/milestones are converted to
    iteration counts using num_batches (iterations per epoch).
    """
    # Copy so pop() does not mutate the caller's config.
    annealing_config = copy.deepcopy(annealing_config)
    annealing_type = annealing_config.pop("type")

    # per epoch or per iter
    step_interval = 'epoch'
    if "step_interval" in annealing_config:
        step_interval = annealing_config.pop("step_interval")

    if annealing_type == "step_lr":
        # convert epoch steps to iter steps
        # expochs can also be floats like 1.5
        if step_interval == "epoch":
            annealing_config["step_size"] = int(num_batches * \
                                                annealing_config["step_size"])
        lr_scheduler = StepLR(optimizer=optimizer, **annealing_config)
    elif annealing_type == "multi_step_lr":
        # convert epoch steps to iter steps
        if step_interval == "epoch":
            annealing_config["milestones"] = [int(i * num_batches) for i in annealing_config["milestones"]]
        lr_scheduler = MultiStepLR(optimizer=optimizer, **annealing_config)
    elif annealing_type == "rampup-keep-expdecay-keep":
        # emulate SpecAugment scheduling
        lr_scheduler = RampupKeepExpdecayKeepLRScheduler(optimizer=optimizer, **annealing_config)
    elif annealing_type == 'val_loss':
        lr_scheduler = ReduceLROnPlateau(optimizer, **annealing_config)
    else:
        raise ValueError("{} LR scheduler not supported".format(
            annealing_type))

    return lr_scheduler


class RampupKeepExpdecayKeepLRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Implements the LR schedule described in the specaugment paper.

    Piecewise schedule over step counts:
      [0, sr)  : linear ramp from 0 to peak_lr
      [sr, si) : hold peak_lr
      [si, sf) : exponential decay from peak_lr toward floor_lr
      [sf, ∞)  : hold floor_lr
    """

    def __init__(self, optimizer, peak_lr=0.001, floor_lr=0.00001, sr=1000, si=40000, sf=160000, last_epoch=-1):
        assert (peak_lr >= floor_lr)
        self.peak_lr = peak_lr
        self.floor_lr = floor_lr
        assert (sr <= si)
        assert (si <= sf)
        self.sr = sr
        self.si = si
        self.sf = sf
        # Decay rate chosen so lr(sf) == floor_lr exactly: gamma = ln(floor/peak)/(sf-si).
        self.gamma = math.log(self.floor_lr / self.peak_lr) / (float(self.sf - self.si))
        # NOTE(review): leftover debug prints — consider routing through print_rank.
        print('self.gamma')
        print(self.gamma)
        self.step_count = 0
        super(RampupKeepExpdecayKeepLRScheduler, self).__init__(optimizer, last_epoch=last_epoch)

    def step(self, epoch=None):
        # Overrides the base step(): writes this schedule's LR into every param
        # group directly. The epoch argument is accepted for API compatibility
        # but ignored; progress is tracked by self.step_count.
        for p, lr in zip(self.optimizer.param_groups, self.get_lr()):
            p['lr'] = lr
        self.step_count += 1

    def get_lr(self):
        lr = self.floor_lr
        if self.step_count < self.sr:
            # linear ramp up
            lr = self.peak_lr * float(self.step_count) / float(self.sr)
        elif self.step_count < self.si:
            # keep peak_lr
            lr = self.peak_lr
        elif self.step_count < self.sf:
            # exponential decay from peak_lr to floor_lr
            lr = self.peak_lr * math.exp(self.gamma * (float(self.step_count - self.si)))
        # Same lr for every param group (one entry per base lr).
        return [lr for base_lr in self.base_lrs]


class ScheduledSamplingScheduler():
    """ Implementing the schedule sampling rate schedule.

    0 - ramp_start = initial_rate
    ramp_start - ramp_end = {linearly increase to final_rate}
    ramp_end - infinity = final_rate
    """

    def __init__(self, model, ramp_start, ramp_stop, initial_rate, final_rate):
        self.model = model
        self.ramp_start = ramp_start
        self.ramp_stop = ramp_stop
        self.initial_rate = initial_rate
        self.final_rate = final_rate
        self.iter = 0

    def step(self):
        """Advance one iteration and write the current rate onto the model."""
        if self.iter < self.ramp_start:
            self.model.scheduled_sampling_rate = self.initial_rate
        elif self.iter >= self.ramp_start and self.iter <= self.ramp_stop:
            # Linear interpolation between initial and final rate across the ramp.
            self.model.scheduled_sampling_rate = self.initial_rate + (self.final_rate - self.initial_rate) * (
                (self.iter - self.ramp_start) / (self.ramp_stop - self.ramp_start))
        else:
            self.model.scheduled_sampling_rate = self.final_rate

        # Scheduled sampling is considered enabled whenever the rate is nonzero.
        self.model.scheduled_sampling = (self.model.scheduled_sampling_rate != 0)
        self.iter += 1

    def state_dict(self):
        # Exclude the model itself: only the scalar schedule state is checkpointed.
        return {key: value for key, value in self.__dict__.items() if key != 'model'}

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)


class NBestTaskScheduler():
    """ Implementing the scheduler for multi-task training.

    num_tasks[0]: 0 <= i < iteration_per_task[0]
    num_tasks[1]: iteration_per_task[0] <= i < iteration_per_task[1]
    """

    def __init__(self, num_tasks, iteration_per_task):
        assert len(num_tasks) == len(iteration_per_task), "Mismatched length {}!={}".format(len(num_tasks), len(iteration_per_task))
        self.iter = 0
        self.stagex = 0  # index of the current stage into num_tasks / iteration_per_task
        self.num_tasks = num_tasks
        self.iteration_per_task = iteration_per_task

    def current_num_tasks(self):
        """Number of tasks active in the current stage."""
        return self.num_tasks[self.stagex]

    def no_label_updates(self):
        """Return how many times transcription must be updated."""
        return (self.iter // self.iteration_per_task[-1]) + 1

    def set_iteration_no(self, iter_no):
        """Jump the scheduler to an absolute iteration number (e.g. on resume)."""
        self.iter = iter_no

    def step(self):
        """Advance one iteration, moving to the next stage at each boundary.

        The stage index wraps: iterations are taken modulo the last entry of
        iteration_per_task, so the whole stage sequence repeats each cycle.
        """
        print_rank("Iter={}: #tasks {} at stage {}".format(self.iter, self.current_num_tasks(), self.stagex))
        local_iter = self.iter % self.iteration_per_task[-1]
        if local_iter == 0:
            self.stagex = 0
        elif local_iter >= self.iteration_per_task[self.stagex]:
            self.stagex += 1
        self.iter += 1


# Logging and write-to-disk utilities
def init_logging(log_dir, loglevel=logging.DEBUG):
    """Initialize logging: file log_dir/log.out plus mirroring to stdout."""
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "log.out")
    logging.basicConfig(filename=log_file, level=loglevel)
    handler = logging.StreamHandler(stream=sys.stdout)
    logging.getLogger().addHandler(handler)


def print_cuda_stats():
    """Log CUDA memory stats (no-op besides a message when CUDA is unavailable)."""
    if torch.cuda.is_available():
        print_rank("torch.cuda.memory_allocated(): {}".format(torch.cuda.memory_allocated()))
        # NOTE(review): memory_cached() is a deprecated alias of memory_reserved()
        # in newer torch versions — confirm the targeted torch version.
        print_rank("torch.cuda.memory_cached(): {}".format(torch.cuda.memory_cached()))
        print_rank("torch.cuda.synchronize(): {}".format(torch.cuda.synchronize()))
    else:
        print_rank("No CUDA GPU available")


def print_rank(str, loglevel=logging.INFO):
    """Log a timestamped message. NOTE(review): parameter name shadows builtin str."""
    str = "{} : {}".format(time.ctime(), str)
    logging.log(loglevel, str)


def print_profiler(profiler, loglevel=logging.INFO):
    """Log the top-20 cumulative-time entries of a cProfile profiler, line by line."""
    memfile = io.StringIO()
    pstats.Stats(profiler, stream=memfile) \
        .strip_dirs() \
        .sort_stats(pstats.SortKey.CUMULATIVE) \
        .print_stats(20)
    for l in memfile.getvalue().split('\n'):
        print_rank(l, loglevel=loglevel)
    memfile.close()


def write_yaml(save_path, config):
    """Dump a config dict to a YAML file."""
    with open(save_path, 'w', encoding='utf8') as yaml_file:
        yaml.dump(config, yaml_file, default_flow_style=False)


def torch_save(save_path, state_or_model):
    """Thin wrapper around torch.save (kept so try_except_save can retry it)."""
    torch.save(state_or_model, save_path)


def write_tokens(save_path, token_list):
    """Write one token per line to a UTF-8 text file."""
    with open(save_path, 'w', encoding='utf8') as token_fid:
        for w in token_list:
            token_fid.write(w + '\n')


def try_except_save(save_fn, **kwargs):
    """ Try to write it out 3 times, retrying on IOError."""
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            save_fn(**kwargs)
        except IOError:
            print_rank("Write operation failed on {} attempt".format(attempt))
        else:
            print_rank("Write operation succeeded in {} attempts".format(attempt))
            return


def write_nbest_jsonl(uttid2jsonl, uttid2hypos, uttid2scores, outputpath, nbest, orgpath="", newpath=""):
    """ Dump a json list file with n-best hypos.

    Args:
        uttid2jsonl (dict): utterance id -> template JSON record
        uttid2hypos (dict): utterance id -> list of hypothesis token lists
        uttid2scores (dict): utterance id -> per-hypothesis model scores
        outputpath (str): destination JSON-list file
        nbest (int): number of hypotheses per utterance
        orgpath/newpath (str): substring replacement applied to each record's "wav" path
    """
    newjsonl = []
    for uttid, jsonl in uttid2jsonl.items():
        if not uttid in uttid2hypos:
            print("Missing utterance {} in results".format(uttid))
            continue
        hypos = uttid2hypos[uttid]
        if nbest > 1:
            # re-normalize the probablity from N-best: ignoring the events out of the N-best hypos
            weights = uttid2scores[uttid]
            if len(weights) < nbest:
                # Pad missing scores with the top hypothesis' score.
                for n in range(len(weights), nbest):
                    print_rank("Mising {}-th best result in {}. Appending {}".format(n, uttid, weights[0]))
                    weights = np.append(weights, np.array(weights[0]))
            weights = softmax(weights[0:nbest]) if uttid in uttid2scores else np.ones(nbest) / nbest

            # Filling the missing hypos with the 1st best candidate
            for n in range(min(nbest, len(hypos))):
                newjson = copy.deepcopy(jsonl)
                newjson["id"] = "{}-{}".format(uttid, n)
                newjson["text"] = " ".join(hypos[n])
                newjson["loss_weight"] = weights[n]
        else:
            newjson = copy.deepcopy(jsonl)
            newjson["id"] = uttid
            newjson["text"] = " ".join(hypos[0])
        # NOTE(review): in the nbest>1 branch this append sits outside the inner
        # loop, so only the LAST of the n candidate records is kept — looks like
        # the append was meant to be inside the loop; confirm intent.
        newjsonl.append(newjson)

    with open(outputpath, 'w') as ofp:
        for jsonl in newjsonl:
            jsonl["wav"] = jsonl["wav"].replace(orgpath, newpath)
            ofp.write("{}\n".format(json.dumps(jsonl)))
    return True


def write_multitask_jsonl(uttid2jsonl, uttid2hypos, uttid2scores, outputpath, nbest, orgpath="", newpath=""):
    """ Dump a json list file with n-best hypos.

    For nbest == 1 this delegates to write_nbest_jsonl; otherwise each record
    carries the 1-best text plus the remaining hypotheses in "subtextl" and
    softmax-normalized "task_weights". Records where every hypothesis is the
    empty string are skipped.
    """
    if nbest == 1:
        return write_nbest_jsonl(uttid2jsonl, uttid2hypos, uttid2scores, outputpath, nbest, orgpath, newpath)

    newjsonl = []
    for uttid, jsonl in uttid2jsonl.items():
        if not uttid in uttid2hypos:
            print_rank("Missing utterance {} in results".format(uttid))
            continue
        hypos = uttid2hypos[uttid]

        # re-normalize the probablity from N-best: ignoring the events out of the N-best hypos
        weights = uttid2scores[uttid]
        if len(weights) < nbest:
            # Pad missing scores with the top hypothesis' score.
            for n in range(len(weights), nbest):
                print_rank("Mising {}-th best result in {}. Appending {}".format(n, uttid, weights[0]))
                weights = np.append(weights, np.array(weights[0]))
        weights = softmax(weights[0:nbest]) if uttid in uttid2scores else np.ones(nbest) / nbest
        # NOTE(review): newjson aliases (does not copy) the template record.
        newjson = jsonl
        newjson["task_weights"] = weights.tolist()
        assert len(weights) == nbest, "{}: Weight length does not match: {} != {}".format(uttid, len(weights), nbest)
        newjson["text"] = " ".join(hypos[0])
        newjson["subtextl"] = []
        all_null_results = newjson["text"] == ""
        for n in range(1, nbest):
            if n < len(hypos):
                newjson["subtextl"].append(" ".join(hypos[n]))
            else:
                # Fall back to the 1-best hypothesis when the n-th is missing.
                print_rank("Mising {}-th best result in {}".format(n, uttid))
                newjson["subtextl"].append(" ".join(hypos[0]))
            if all_null_results is True:
                all_null_results = newjson["subtextl"][n - 1] == ""
        assert len(newjson["subtextl"]) == nbest - 1, "#sub-rec results does not match: {} != {}".format(len(newjson["subtextl"]), nbest - 1)

        # take meaningful results only and ignore null string
        if all_null_results is False:
            newjsonl.append(newjson)
        else:
            print_rank("Skip {}: Invalid result '{}'".format(uttid, newjson["text"]))

    with open(outputpath, 'w') as ofp:
        for jsonl in newjsonl:
            jsonl["wav"] = jsonl["wav"].replace(orgpath, newpath)
            ofp.write("{}\n".format(json.dumps(jsonl)))
    return True


# NOTE(review): mutable default arguments — the OrderedDict defaults are shared
# across calls, so successive calls without explicit dicts accumulate entries.
# Looks unintentional; confirm before changing (callers may rely on it).
def load_eval_result_jsonl(resultjsonl, uttid2hypos=OrderedDict(), uttid2scores=OrderedDict(), dumpfp=None, dump_msg="RESULT: "):
    """Load the result JSON list file dumped by Evaluator().

    Args:
        resultjsonl (str): input JSON list file
        uttid2hypos: (dict): maps the utterance ID to text, [uttid] = hypothesis text
        uttid2scores (dict): maps the utterance ID to a confidence score, [uttid] = confidence score(s)
        dumpfp (file): pointer where the WERs will be written out
        dump_msg (str): message string before the WER result

    Returns:
        (uttid2hypos, uttid2scores, total_weighted_best_wer,
         total_weighted_oracle_wer, total_length)
    """
    total_weighted_best_wer = 0
    total_weighted_oracle_wer = 0
    total_length = 0
    with open(resultjsonl) as resultfp:
        for line in resultfp:
            elems = json.loads(line.strip())
            if "hypothesis" in elems:
                # Per-utterance record: collect hypotheses and (optionally) scores.
                uttid = elems["utt_id"]
                params = list(elems["hypothesis"].keys())
                uttid2hypos[uttid] = elems["hypothesis"][params[0]]
                if "nbest_model_scores" in elems:
                    uttid2scores[uttid] = np.array(elems["nbest_model_scores"][params[0]])
            else:
                # Summary record: accumulate length-weighted best/oracle WER.
                print_rank("Result: {}".format(line.strip()))
                if dumpfp is not None:
                    dumpfp.write("{}{}\n".format(dump_msg, line.strip()))
                params = list(elems["wer-"].keys())
                total_weighted_best_wer += elems["wer-"][params[0]]["best_wer"] * elems["wer-"][params[0]]["total_length"]
                total_weighted_oracle_wer += elems["wer-"][params[0]]["oracle_wer"] * elems["wer-"][params[0]]["total_length"]
                total_length += elems["wer-"][params[0]]["total_length"]
    return uttid2hypos, uttid2scores, total_weighted_best_wer, total_weighted_oracle_wer, total_length


def find_pretrained_model(model_path, config):
    """Return the pre-trained/seed model path if provided in the config, else None."""
    output_file = None
    if config.get("pretrained_model_path", None):
        output_file = config["pretrained_model_path"]
        print_rank('Loading Model from: {}'.format(output_file), loglevel=logging.INFO)
    return output_file


def flatten_grads_model(learner) -> np.ndarray:
    """Given a model flatten all params and return as np array."""
    return np.concatenate([w.grad.detach().clone().cpu().numpy().flatten() for w in learner.parameters()])


def flatten_grads_array(param_array) -> np.array:
    """Given a model flatten all params and return as np array.

    param_array is a sequence of parameter collections; one flat vector is
    produced per collection.
    """
    N = len(param_array)
    tmp_array = []
    for i in range(N):
        tmp_array.append(np.concatenate([w.detach().clone().cpu().numpy().flatten() for w in param_array[i]]))
    return np.array(tmp_array)


def dist_weights_to_model(weights, parameters):
    """Updates the model parameters with the supplied weights.

    weights is a flat numpy vector; consecutive slices are reshaped into each
    parameter tensor in iteration order.
    """
    offset = 0
    for param in parameters:
        new_size = functools.reduce(lambda x, y: x * y, param.shape)
        current_data = weights[offset:offset + new_size]
        param.data[:] = torch.from_numpy(current_data.reshape(param.shape)).to(param.data)
        offset += new_size


def dist_params_to_model(grads, model):
    """Updates the model gradients (Corresponding to each param) with the supplied grads.

    grads is a flat numpy vector; slices are reshaped per parameter and ADDED to
    any existing .grad (set directly when .grad is None).
    """
    offset = 0
    for p in model:
        new_size = functools.reduce(lambda x, y: x * y, p.data.shape)
        current_data = torch.from_numpy(grads[offset:offset + new_size].reshape(p.data.shape)).type(p.data.dtype).to(p)
        # NOTE(review): `== None` should be `is None` per PEP 8; behavior here is
        # the same for the None case.
        p.grad = current_data if p.grad == None else p.grad + current_data
        offset += new_size


def reshape_params_to_model(grads, model):
    """ Given Gradients and a model architecture this method updates the model
    gradients (Corresponding to each param) with the supplied grads.

    Returns the list of per-parameter gradient tensors without mutating the model.
    """
    offset = 0
    reshaped_grads = []
    for p in model:
        new_size = functools.reduce(lambda x, y: x * y, p.shape)
        current_data = torch.from_numpy(grads[offset:offset + new_size].reshape(p.shape)).type(p.dtype).to(p)
        reshaped_grads.append(current_data)
        offset += new_size
    return reshaped_grads


def to_device(x):
    """Move a tensor/module to CUDA when available, otherwise return it unchanged."""
    return x.cuda() if torch.cuda.is_available() else x


def update_json_log(log_path, status_info):
    """Merge status_info into the JSON status file at log_path (read-modify-write)."""
    elems = {}
    if os.path.exists(log_path):
        with open(log_path, 'r') as logfp:
            elems = json.load(logfp)
            print_rank("Loaded status info: {}".format(elems))
    for k, v in status_info.items():
        elems[k] = v
    with open(log_path, 'w') as logfp:
        json.dump(elems, logfp)
    print_rank("Updated status info: {}".format(elems))


def scrub_empty_clients(data_strct):
    """ Clean empty clients in the data structure: drop every user whose
    num_samples entry is 0, preserving user_data_label when present."""
    users_out = []
    user_data_out = {}
    num_samples_out = []
    if 'user_data_label' in data_strct.keys():
        user_data_label_out = {}
    for ix, user in enumerate(data_strct['users']):
        if data_strct['num_samples'][ix] > 0:
            users_out.append(user)
            user_data_out[user] = data_strct['user_data'][user]
            num_samples_out.append(data_strct['num_samples'][ix])
            if 'user_data_label' in data_strct.keys():
                user_data_label_out[user] = data_strct['user_data_label'][user]
    if ('user_data_label' in data_strct.keys()):
        return edict({'users': users_out, 'user_data': user_data_out, 'num_samples': num_samples_out, 'user_data_label': user_data_label_out})
    else:
        return edict({'users': users_out, 'user_data': user_data_out, 'num_samples': num_samples_out})


def compute_grad_cosines(grads, model_grad):
    """Cosine similarity between each client gradient in grads and model_grad.

    Each gradient is a list of per-parameter tensors; returns 0 for degenerate
    (zero-norm) inputs.
    """
    def compute_cosine(g, m):
        tot = 0
        g2 = 0
        m2 = 0
        for p1, p2 in zip(g, m):
            tot += torch.mul(p1, p2.to('cpu')).sum().item()
            g2 += torch.mul(p1, p1).sum().item()
            m2 += torch.mul(p2, p2).sum().item()
        return tot / (np.sqrt(g2) * np.sqrt(m2)) if g2 > 0 and m2 > 0 else 0
    return [compute_cosine(g, model_grad) for g in grads]


# Personalization Routines
def convex_inference(model_global, model_personal, alpha):
    """ Model interpolation: accuracy of alpha-blended personal/global probabilities
    against the global model's labels."""
    targets = torch.tensor(model_global['labels'])
    probs = alpha * model_personal['probabilities'] + (1 - alpha) * model_global['probabilities']
    probs = torch.argmax(torch.tensor(probs), dim=1)
    return torch.mean((probs == targets).float()).detach().cpu().item()


def alpha_update(model_global, model_personal, alpha, eta):
    """ Training convex model interpolation weight.

    One gradient step on alpha (learning rate eta) with an L2-style 0.02*alpha
    term; the result is clipped to (0.0001, 0.9999) and falls back to 0.75 when
    non-finite.
    """
    grad_alpha = 0.0
    for l_params, p_params in zip(model_global.parameters(), model_personal.parameters()):
        dif = p_params.data - l_params.data
        grad = alpha * p_params.grad + (1 - alpha) * l_params.grad
        grad_alpha += dif.view(-1).T.dot(grad.view(-1))

    grad_alpha += 0.02 * alpha
    alpha_n = alpha - eta * grad_alpha
    alpha_n = np.clip(alpha_n.detach().cpu().item(), 0.0001, 0.9999)
    return alpha_n if np.isfinite(alpha_n) else 0.75


# Semi-supervision Routines
def get_label_VAT(local_logits, server_logits, thre, comp):
    """ Returns the estimated labels to SemiSupervision Task.

    Per example, picks the argmax label from whichever of local/server logits is
    preferred by the comparison criterion (comp == 'var': higher variance wins;
    comp == 'ent': lower entropy wins), subject to the max logit exceeding thre.

    Returns (labels, idx, var, ratio) where idx are the selected batch indices,
    var the confidence ratios, and ratio the fraction of server-chosen labels.
    """
    bs = np.shape(local_logits)[0]
    logit_dim = np.shape(local_logits)[1]
    labels = []
    idx = []
    var = []

    if comp == 'var':
        local_var = torch.var(local_logits, dim=1)
        server_var = torch.var(server_logits, dim=1)
        server = 0
        local = 0
        ratio = 0
        for bs_i in range(bs):
            if local_var[bs_i] >= server_var[bs_i] and torch.max(local_logits[bs_i]) > thre:
                labels.append(torch.argmax(local_logits[bs_i]))
                idx.append(bs_i)
                var.append((server_var[bs_i]) / (local_var[bs_i]))
                local += 1
            if local_var[bs_i] < server_var[bs_i] and torch.max(server_logits[bs_i]) > thre:
                labels.append(torch.argmax(server_logits[bs_i]))
                idx.append(bs_i)
                var.append((local_var[bs_i]) / (server_var[bs_i]))
                server += 1
        if len(labels) != 0:
            labels = torch.stack(labels)
            var = torch.stack(var)
            ratio = server / (server + local)
    elif comp == 'ent':
        # NOTE(review): scipyst is not among the imports visible in this chunk —
        # presumably `import scipy.stats as scipyst` elsewhere; confirm it resolves.
        local_var = scipyst.entropy(local_logits.cpu(), axis=1) + 0.00001
        server_var = scipyst.entropy(server_logits.cpu(), axis=1) + 0.00001
        server = 0
        local = 0
        ratio = 0
        for bs_i in range(bs):
            if 1 / local_var[bs_i] >= 1 / server_var[bs_i] and torch.max(local_logits[bs_i]) > thre:
                labels.append(torch.argmax(local_logits[bs_i]))
                idx.append(bs_i)
                var.append((1 / server_var[bs_i]) / (1 / local_var[bs_i]))
                local += 1
            if 1 / local_var[bs_i] < 1 / server_var[bs_i] and torch.max(server_logits[bs_i]) > thre:
                labels.append(torch.argmax(server_logits[bs_i]))
                idx.append(bs_i)
                var.append((1 / local_var[bs_i]) / (1 / server_var[bs_i]))
                server += 1
        if len(labels) != 0:
            labels = torch.stack(labels)
            #var = torch.stack(var)
            ratio = server / (server + local)

    return labels, idx, var, ratio