Repository: microsoft/msrflute Branch: main Commit: 8bfe0854ab29 Files: 151 Total size: 775.6 KB Directory structure: gitextract_qg20kqyy/ ├── .flake8 ├── .github/ │ └── workflows/ │ ├── build_docs.yml │ └── codeql.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.TXT ├── NOTICE.txt ├── README.md ├── SECURITY.md ├── azure-pipelines.yml ├── configs/ │ ├── hello_world_mlm_bert_json.yaml │ └── hello_world_nlg_gru_json.yaml ├── core/ │ ├── __init__.py │ ├── client.py │ ├── config.py │ ├── dataloader.py │ ├── dataset.py │ ├── evaluation.py │ ├── federated.py │ ├── metrics.py │ ├── model.py │ ├── schema.py │ ├── server.py │ ├── strategies/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── dga.py │ │ ├── fedavg.py │ │ ├── fedlabels.py │ │ └── utils.py │ └── trainer.py ├── doc/ │ └── sphinx/ │ ├── Makefile │ ├── advanced.rst │ ├── class_reference.rst │ ├── conf.py │ ├── index.rst │ ├── launch.rst │ ├── make.bat │ ├── overview.rst │ ├── reference.rst │ ├── requirements.txt │ └── scenarios.rst ├── e2e_trainer.py ├── experiments/ │ ├── __init__.py │ ├── classif_cnn/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── cifar_dataset.py │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ └── utils/ │ │ ├── centralized_training.py │ │ └── download_and_convert_data.py │ ├── cv/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── data.py │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ ├── model_vgg.py │ │ └── server.py │ ├── cv_cnn_femnist/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocess.py │ │ └── model.py │ ├── cv_lr_mnist/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocessing.py │ │ └── model.py │ ├── cv_resnet_fedcifar100/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ 
│ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocessing.py │ │ ├── group_normalization.py │ │ └── model.py │ ├── ecg_cnn/ │ │ ├── .gitignore │ │ ├── centralized_model.ipynb │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ ├── readme.md │ │ └── utils/ │ │ └── preprocess.py │ ├── fednewsrec/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocess_mind.py │ │ ├── fednewsrec_model.py │ │ ├── model.py │ │ └── utils.py │ ├── mlm_bert/ │ │ ├── README.md │ │ ├── config.py │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ └── utils/ │ │ ├── trainer_pt_utils.py │ │ └── trainer_utils.py │ ├── nlg_gru/ │ │ ├── README.md │ │ ├── config.py │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ └── dataset.py │ │ ├── model.py │ │ └── utils/ │ │ └── utility.py │ ├── nlp_rnn_fedshakespeare/ │ │ ├── README.md │ │ ├── config.yaml │ │ ├── dataloaders/ │ │ │ ├── dataloader.py │ │ │ ├── dataset.py │ │ │ └── preprocessing.py │ │ └── model.py │ └── semisupervision/ │ ├── README.md │ ├── config.yaml │ ├── dataloaders/ │ │ ├── RandAugment.py │ │ ├── cifar_dataset.py │ │ ├── dataloader.py │ │ └── dataset.py │ └── model.py ├── extensions/ │ ├── RL/ │ │ └── RL.py │ ├── __init__.py │ ├── privacy/ │ │ ├── __init__.py │ │ ├── analysis.py │ │ ├── dp_kmeans.py │ │ └── metrics.py │ └── quantization/ │ └── quant.py ├── requirements.txt ├── testing/ │ ├── README.md │ ├── build_vocab.py │ ├── create_data.py │ ├── hello_world_classif_cnn.yaml │ ├── hello_world_ecg_cnn.yaml │ ├── hello_world_mlm_bert.yaml │ ├── hello_world_nlg_gru.yaml │ └── test_e2e_trainer.py └── utils/ ├── __init__.py ├── data_utils.py ├── dataloaders_utils.py ├── optimizers/ │ ├── adamW.py │ ├── lamb.py │ └── lars.py ├── preprocessing/ │ ├── create-hdf5.py │ ├── create-json.py │ └── from_json_to_hdf5.py └── utils.py ================================================ 
FILE CONTENTS ================================================ ================================================ FILE: .flake8 ================================================ [flake8] ignore = E501 ================================================ FILE: .github/workflows/build_docs.yml ================================================ name: Build docs on: push: branches: [ main ] pull_request: branches: [ main ] workflow_dispatch: jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Sphinx build uses: ammaraskar/sphinx-action@0.4 with: docs-folder: doc/sphinx/ - name: Commit documentation changes run: | git clone https://github.com/microsoft/msrflute --branch gh-pages --single-branch gh-pages cp -r doc/sphinx/_build/html/* gh-pages/ cd gh-pages git config --local user.email "action@github.com" git config --local user.name "GitHub Action" git add . git commit -m "Update documentation" -a || true - name: Push changes uses: ad-m/github-push-action@master with: branch: gh-pages directory: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/codeql.yml ================================================ # This is based on the standard CodeQL workflow provided by Github name: "CodeQL" on: push: branches: [ "main" ] pull_request: # The branches below must be a subset of the branches above branches: [ "main" ] schedule: - cron: '35 2 * * 3' jobs: analyze: name: Analyze runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write strategy: fail-fast: false matrix: language: [ 'python' ] steps: - name: Checkout repository uses: actions/checkout@v3 - name: Set-up MPI uses: mpi4py/setup-mpi@v1 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
# If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v2 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 ================================================ FILE: .gitignore ================================================ __pycache__/ .vscode/ doc/sphinx/_build testing/logs.txt testing/outputs testing/mockup ================================================ FILE: .gitmodules ================================================ [submodule "utils/dp-accountant"] path = utils/dp-accountant url = https://github.com/microsoft/prv_accountant ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. ## [0.1.0] - 2021-11-22 We're super excited to announce FLUTE: Federated Learning Utilities for Testing and Experimentation, a platform for conducting high-performance federated learning simulations! This first release fully focuses on implementing fast prototyping to validate different FL scenarios in a federated environment. ### Features - large scale simulation (millions of clients, sampling tens of thousands per round). - multi-GPU and multi-node orchestration backed up by MPI. - local or global differential privacy. - model quantization. - a variety of standard optimizers and aggregation methods. - most model types including CNNs, RNNs, and Huggingface Transformers. - extensibility, enabling new models, dataloaders, optimizers, and aggregators. - local or cloud-based job staging using AzureML. ## [1.0.0] - 2022-08-29 This release contains major changes in the communication backbone; in order to run previous experiments you have already integrated in FLUTE, please make sure to use `torch.distributed` instead of `MPI` to launch the jobs. For more documentation about the new command, please refer to the [README](README.md). 
### New features - 🏎 Better performance: Support for NCCL and Gloo as backend communication protocols. - Improvements in GPU utilization and overall communication speed (on the order of minutes!) for projects with huge models and datasets. - 🌟 Remove file type dependency on client.py, now FLUTE can receive any kind of dataset and even download the data on-the-fly. The data instantiation is completely under control of each task dataset. - In older versions FLUTE only allowed `json` and `hdf5` files, so the client could recognize it. - 🌟 Abstract classes for new models/dataloaders. - 🌟 Allows Federated Learning with Personalization. - Personalization allows you to leverage each client local data to obtain models that are better adjusted to their own data distribution. You can run the `cv` task in order to try out this feature. ## [1.0.1] - 2023-07-29 🔋 This release removes the restriction of the minimum number of GPUs available in FLUTE, allowing users to run experiments using a single-GPU worker by instantiating both: Server and clients on the same device. For more documentation about how to run an experiment using a single GPU, please refer to the [README](README.md). ### New features - 🌟 Include FedProx aggregation method ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 message: "To cite Microsoft FLUTE in academic papers, please cite it as below." authors: - name: "Microsoft Research" title: "FLUTE: Federated Learning Utilities for Testing and Experimentation" version: 1.0.0 date-released: "2021-11-22" url: "https://github.com/microsoft/msrflute" license: - MIT keywords: - FLUTE - federated learning ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Microsoft Open Source Code of Conduct This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
Resources: - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com. This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. ### Pull Requests Submit pull requests to **branch contribution**. PR's in any other branch will not be accepted. When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repositories using our CLA. ================================================ FILE: LICENSE.TXT ================================================ Copyright (c) Microsoft Corporation. 
MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: NOTICE.txt ================================================ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION Do Not Translate or Localize This software incorporates components from the projects listed below. The original copyright notices and the licenses under which Microsoft received such components are set forth below and are provided for informational purposes only. Microsoft reserves all rights not expressly granted herein, whether by implication, estoppel or otherwise. This software includes parts of the Huggingface/Transformers Library (https://github.com/huggingface/transformers). State-of-the-art of Natural Language Processing for Jax, PyTorch and TensorFlow. 
Huggingface/Transformers library is licensed under Apache License 2.0, you can find a copy of this license at https://github.com/huggingface/transformers/blob/master/LICENSE This software includes parts of the Tensorflow/Privacy Library (https://github.com/tensorflow/privacy). A library that includes implementations of TensorFlow optimizers for training machine learning models with differential privacy. The Tensorflow/Privacy library is licensed under Apache License 2.0, you can find a copy of this license at https://github.com/tensorflow/privacy/blob/master/LICENSE This software includes parts of LEAF Library (https://github.com/TalwalkarLab/leaf). A Benchmark for Federated Settings. LEAF library is licensed under BSD 2-Clause License, you can find a copy of this license at https://github.com/TalwalkarLab/leaf/blob/master/LICENSE.md This software includes parts of ECG Classification from Kaggle Competition (https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism). An example for ECG Classification | CNN LSTM Attention Mechanism. This example is licensed under Apache License 2.0, you can find a copy of this license at https://www.apache.org/licenses/LICENSE-2.0 This software includes parts of Torchvision Library (https://github.com/pytorch/vision.git). A package of popular datasets, model architectures, and common image transformations for computer vision. This example is licenced under BSD 3-Clause License, you can find a copy of this licence at https://github.com/pytorch/vision/blob/main/LICENSE This software includes parts of FedML Library (https://github.com/FedML-AI/FedML).The Community Building Open and Collaborative AI Anywhere at Any Scale. FedML library is licensed under Apache License 2.0, you can find a copy of this license at https://github.com/FedML-AI/FedML/blob/master/LICENSE This software includes parts of FedNewsRec-EMNLP-Findings-2020 repository (https://github.com/taoqi98/FedNewsRec). 
Code from the paper "Privacy-Preserving News Recommendation Model Learning". This example is licenced under MIT License, you can find a copy of this licence at https://github.com/taoqi98/FedNewsRec/blob/master/LICENSE This software includes parts of Fast AutoAugment repository (https://github.com/kakaobrain/fast-autoaugment). Code from the paper "Fast AutoAugment" (Accepted at NeurIPS 2019). This example is licenced under MIT License, you can find a copy of this licence at https://github.com/kakaobrain/fast-autoaugment/blob/master/LICENSE This software includes parts of NIID-Bench repository (https://github.com/Xtra-Computing/NIID-Bench). Code from the paper "Federated Learning on Non-IID Data Silos: An Experimental Study". This example is licenced under MIT License, you can find a copy of this licence at https://github.com/Xtra-Computing/NIID-Bench/blob/main/LICENSE ================================================ FILE: README.md ================================================ # FLUTE Welcome to FLUTE (Federated Learning Utilities for Testing and Experimentation), a platform for conducting high-performance federated learning simulations. ## Features FLUTE is a pytorch-based orchestration environment enabling GPU or CPU-based FL simulations. The primary goal of FLUTE is to enable researchers to rapidly prototype and validate their ideas. Features include: - large scale simulation (millions of clients, sampling tens of thousands per round) - single/multi GPU and multi-node orchestration - local or global differential privacy - model quantization - a variety of standard optimizers and aggregation methods - most model types including CNNs, RNNs, and Huggingface Transformers. - extensibility, enabling new models, dataloaders, optimizers, and aggregators. 
- local or cloud-based job staging using AzureML ## Benchmarking The following common tasks were used to evaluate the performance in speed/memory utilization of FLUTE compared with the most representative simulation platforms based on their number of starts on GitHub: FedML 0.7.303 and Flower 1.0.0. |Task|Data Set|Model|Algorithm|# Clients|Clients per round|Batch Size|Client Optimizer|lr|Epochs|# Rounds|Test Freq| |:----|:----|:----|:----|:----|:----|:----|:----|:----|:----|:----|:----| |CV|MNIST|LR|FedAvg|1000|10|10|SGD|0.03|1|100|20| |CV|Federated EMNIST|CNN (2 Conv + 2 FC)|FedAvg|3400|10|20|SGD|0.1|1|1500|50| |CV|FED_CIFAR-100|ResNet-18+group normalization|FedAvg|500|10|20|SGD|0.1|1|4000|50| |NLP|Shakespeare|RNN (2 LSTM + 1 FC)|FedAvg|715|10|4|SGD|0.8|1|1200|50| ### FedML Comparison This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). Showing that in some cases FLUTE can outperform 43x faster. ``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` You can find the examples above in [experiments](experiments). 
### Flower Comparison This comparison was carried out using Flower (Simulator) on version 1.0.0 at commit ID [4e7fad9](https://github.com/adap/flower/tree/4e7fad99389a5ee511730841b61f279e3359cb16) with the [lr_mnist](experiments/cv_lr_mnist/) task. Showing that in some cases FLUTE can outperform 53x faster. ``` ________________________________________________ | | Flower (Ray) | FLUTE (NCCL/Gloo) | | | Acc | Time | Acc | Time | |--------|-----|-------------|-----|-------------| | CPU | ~80 | 00:30:14 | ~80 | 00:03:20 | | GPU 2x | ~80 | 01:21:44 | ~80 | 00:01:31 | | GPU 4x | ~79 | 00:56:45 | ~81 | 00:01:26 | ------------------------------------------------ ``` You can find the example above in the [cv_lr_mnist](experiments/cv_lr_mnist/) folder. ## Quick Start Install the requirements stated inside of `requirements.txt`. Ideally this should be done inside of a virtual environment, for instance, using Anaconda. ``` conda create -n FLUTE python==3.7 pip install -r requirements.txt ``` FLUTE uses torch.distributed API as its main communication backbone, supporting three built-in backends. For more information please refer to [Distributed Communication Package](https://pytorch.org/docs/stable/distributed.html). Therefore, we highly suggest to use NCCL backend for distributed GPU training and Gloo for distributed CPU training. There is no `setup.py` as FLUTE is not currently distributed as a package, but instead meant to run from the root of the repository. After this initial setup, you can use the data created for the integration test for a first local run. Note that this data needs to be downloaded manually inside the `testing` folder, for more instructions please look at [the README file inside `testing`](testing/README.md). 
For single-GPU runs: ``` python -m torch.distributed.run --nproc_per_node=1 e2e_trainer.py -dataPath ./testing -outputPath scratch -config testing/hello_world_nlg_gru.yaml -task nlg_gru -backend nccl ``` For multi-GPU runs (3 GPUs): ``` python -m torch.distributed.run --nproc_per_node=3 e2e_trainer.py -dataPath ./testing -outputPath scratch -config testing/hello_world_nlg_gru.yaml -task nlg_gru -backend nccl ``` The config file `testing/hello_world_nlg_gru.yaml` has some comments explaining the major sections and some important details; essentially, it consists in a very short experiment where a couple of iterations are done for just a few clients. A `scratch` folder will be created containing detailed logs. ## Documentation Online documentation is available at https://microsoft.github.io/msrflute/ Locally, the documentation is inside the `doc/sphinx` folder. To build the docs on Linux: ``` $ pip install sphinx $ cd doc/sphinx $ make html ``` On Windows, you can use the `make.bat` script. It may be necessary to `export PYTHONPATH=../../` for sphinx to find the code. ## Architecture The core client/server training code is inside the `core` folder. - Server-side federation and global DP application takes place in `server.py`, more specifically in the `OptimizationServer.train()` method. - Client-side training updates take place in the static method `Client.process_round()`, inside `client.py`. General FL orchestration code is in `federated.py`, but for most hub and spoke federation scenarios you won't need to touch this (unless you want to invest in optimizing server-client calls, which would be great!). Note that FLUTE does not implement secure aggregation since this is primarily a security feature for production scenarios; contributors are invited to add it for experimentation purposes. The primary entry point for an experiment is in the script `e2e_trainer.py`. Primary config scripts for experiments are in `configs`. 
For instance, a basic training scenario for a next-word prediction task is set up in `hello_world_nlg_gru_json.yaml`. Privacy accounting is expensive so the main parameters are logged and the actual accounting can be done offline. RDP privacy accounting is in `extensions/privacy/analysis.py`. A better accounting method is in the `dp-accountant` submodule. ## Customization See `experiments` folder for illustrations of how dataloaders and models are customized. In order to include a new experiment, the new scenario must be added following the same folder structure as `nlg_gru` and `mlm_bert`, naming the folder with the task. ## Experiments Experiments are defined by YAML files, examples are provided in the `configs` folder. These can be run either locally or on AzureML. For running experiments on AzureML, the CLI can help. You should first [install the CLI](https://docs.microsoft.com/en-us/azure/machine-learning/reference-azure-machine-learning-cli) (make sure you have v2) and [create a resource group and workspace](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli?tabs=createnewresources%2Cvnetpleconfigurationsv1cli). You can then create a compute cluster, type `az ml compute create -h` for more info. Afterwards, you should write a YAML file with instructions for the job; we provide a simple example below ```yaml experiment_name: basic_example description: Basic example of AML config for submitting FLUTE jobs code: local_path: . 
compute: azureml:Test environment: image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel inputs: data: folder: azureml://datastores/data/paths/cifar mode: rw_mount command: > apt -y update && apt -y install openmpi-bin libopenmpi-dev openssh-client && python3 -m pip install --upgrade pip && python3 -m pip install -r requirements.txt && python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -outputPath=./outputs -dataPath={inputs.data} -task=classif_cnn -config=./experiments/classif_cnn/config.yaml -backend=nccl ``` You should replace `compute` with the name of the one you created before, and adjust the path of the datastore containing the data -- in the example above, we created a datastore called `data` and added to it a folder called `cifar`, which contained the two HDF5 files. The command passed above will install dependencies and then launch a distributed job with 4 threads, for the experiment defined in `experiments/classif_cnn`. Details on how to run a job using the AzureML CLI are given [in its documentation](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli), but typically it suffices to set up the environment and type `az ml job create -f `. In the same page of the documentation, you can also find more info about how to set up the YAML file above, in case other changes are needed. Note that the `local_path` above is relative to the location of the YAML file, so setting it to `.` assumes it is in the same folder as `e2e_trainer.py`. All files on this folder will be uploaded to Azure, including hidden folders such as `.git`, so make sure to temporarily get rid of large files and folders that are not needed. After launching the experiment, you can follow it on AzureML Studio, which prints logs, plots metrics and makes the output easily available after the experiment is finished. ## Privacy Accounting Accounting is expensive, so we log all the privacy parameters so that accounting can be run offline. 
Best run on a Linux box with a GPU. In particular, we use a DP accountant from another Microsoft repository, which is included in ours as a submodule. For using this accountant, just follow the instructions below: ``` $ git submodule update --init --recursive $ cd utils $ cd dp-accountant $ python setup.py install $ ./bin/compute-dp-epsilon --help usage: compute-dp-epsilon [-h] -p SAMPLING_PROBABILITY -s NOISE_MULTIPLIER -i ITERATIONS -d DELTA ``` ## Third Party Notice This software includes the files listed below from the Huggingface/Transformers Library (https://github.com/huggingface/transformers) as part of task performance and preprocessing pretrained models. experiments/mlm_bert └── utils ├── trainer_pt_utils.py └── trainer_utils.py This software includes the file extensions/privacy/analysis.py from the Tensorflow/Privacy Library (https://github.com/tensorflow/privacy) as part of Renyi Differential Privacy implementation. This software includes the script testing/build_vocab.py from LEAF Library (https://github.com/TalwalkarLab/leaf) to create the vocabulary needed to run a testing job. This software includes the model implementation of the example ECG Classification | CNN LSTM Attention Mechanism from Kaggle Competition (https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism) to reproduce the [ecg_cnn](experiments/ecg_cnn/model.py) experiment. This software includes the model implementation of the FedNewsRec repository (https://github.com/taoqi98/FedNewsRec)| Code from the paper "Privacy-Preserving News Recommendation Model Learning" (https://arxiv.org/abs/2003.09592) ported to PyTorch framework to reproduce the [fednewsrec](experiments/fednewsrec/model.py) experiment. For more information about third-party OSS licence, please refer to [NOTICE.txt](NOTICE.txt). 
This software includes the Data Augmentation scripts of the Fast AutoAugment repository (https://github.com/kakaobrain/fast-autoaugment) to preprocess the data used in the [semisupervision](experiments/semisupervision/dataloaders/cifar_dataset.py) experiment. This software included the FedProx logic implementation of the NIID-Bench repository (https://github.com/Xtra-Computing/NIID-Bench/tree/main) as Federated aggregation method used in the [trainer](core/trainer.py) object. ## Support You are welcome to open issues on this repository related to bug reports and feature requests. ## Contributing Contributions are welcomed and encouraged. For details on how to contribute, please see [CONTRIBUTING.md](CONTRIBUTING.md). ================================================ FILE: SECURITY.md ================================================ ## Security Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. ## Reporting Security Issues **Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) * Full paths of source file(s) related to the manifestation of the issue * The location of the affected source code (tag/branch/commit or direct URL) * Any special configuration required to reproduce the issue * Step-by-step instructions to reproduce the issue * Proof-of-concept or exploit code (if possible) * Impact of the issue, including how an attacker might exploit the issue This information will help us triage your report more quickly. If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. ## Preferred Languages We prefer all communications to be in English. ## Policy Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
================================================ FILE: azure-pipelines.yml ================================================ trigger: - main pool: vmImage: 'windows-latest' steps: - task: CredScan@2 inputs: toolMajorVersion: 'V2' - task: Semmle@1 env: SYSTEM_ACCESSTOKEN: $(System.AccessToken) inputs: sourceCodeDirectory: '$(Build.SourcesDirectory)' language: 'python' querySuite: 'Recommended' timeout: '1800' ram: '16384' addProjectDirToScanningExclusionList: true - task: ComponentGovernanceComponentDetection@0 inputs: scanType: 'Register' verbosity: 'Verbose' alertWarningLevel: 'High' - task: PublishSecurityAnalysisLogs@2 inputs: ArtifactName: 'CodeAnalysisLogs' ArtifactType: 'Container' AllTools: true ToolLogsNotFoundAction: 'Standard' ================================================ FILE: configs/hello_world_mlm_bert_json.yaml ================================================ # Basic configuration file for running mlm_bert example using json files. # Parameters needed to initialize the model model_config: model_type: BERT model_folder: experiments/mlm_bert/model.py BERT: loader_type: text model: model_name: roberta-large cache_dir: ./cache_dir use_fast_tokenizer: False mask_token: task: mlm past_index: -1 prediction_loss_only: false process_line_by_line: false training: seed: 12345 label_smoothing_factor: 0 batch_size: 64 max_seq_length: 256 # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of the parameters are needed. enable_global_dp: false # Local dp clips and adds noise on the client and centrally accumulates the privacy budget eps: 100 # epsilon global_sigma: 0.35 # Used when global dp is enabled, specifies the global Gaussian noise weight_scaler: 0.0001 # indicates how the aggregation weights are scaled before noise addition, and unscaled afterwards. 
max_grad: 0.008 # max gradient max_weight: 0.5 # The max_weight and min_weight should be already scaled by weight_scaler min_weight: 0.0000001 # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. fast_aggregation: true wantRL: false # Enable/Disable Reinforcement learning RL: # Reinforcement Learning parameters RL_path_global: false marginal_update_RL: true RL_path: ./RL_models model_descriptor_RL: marginalUpdate network_params: 300,128,128,128,64,100 initial_epsilon: 0.5 final_epsilon: 0.0001 epsilon_gamma: 0.90 max_replay_memory_size: 1000 minibatch_size: 16 gamma: 0.99 optimizer_config: lr: 0.0003 type: adam amsgrad: true annealing_config: type: step_lr step_interval: epoch step_size: 1 gamma: 0.95 optimizer_config: # Configuration for server-side optimizer lr: 0.00001 weight_decay: 0.01 type: adamW annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 1000 val_freq: 4 # Frequency for validation rounds rec_freq: 16 # Frequency for testing rounds initial_val : true # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 10000 # Total number of rounds for FL num_clients_per_iteration: 200 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data val_data: task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 min_words_per_utt: 5 max_samples_per_user: 
5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir # Note this is NOT the main training data configuration, which is configured in the # client config. This section is ignored unless you are running replay data. # If you want to run replay data- set a path name for train_data_server. # train: # loader_type: text # train_data: null # train_data_server: null # desired_max_samples: null test: # Test data configuration test_data: task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 max_samples_per_user: 5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir type: model_optimization # Server type aggregate_median: softmax # FL aggregation method weight_train_loss: mag_mean_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 1.00 initial_lr_client: 0.00001 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # server_replay_config: # This is only applies if the server-side training data is fully configured and loaded # server_iterations: 50 # optimizer_config: # lr: 0.00002 # amsgrad: true # type: adam # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration list_of_train_data: task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 24 max_seq_length: 256 min_words_per_utt: 5 desired_max_samples: 5000 mask_token: num_workers: 0 num_frames: 0 max_grad_norm: 15.0 prepend_datapath: false cache_dir: ./cache_dir pin_memory: true type: optimization meta_optimizer_config: lr: 0.01 type: adam optimizer_config: type: adamW weight_decay: 0.01 amsgrad: true annealing_config: type: step_lr step_interval: epoch step_size: 2 gamma: 1.0 ================================================ FILE: configs/hello_world_nlg_gru_json.yaml ================================================ # Basic configuration file for running nlg_gru example using json files. # Parameters needed to initialize the model model_config: model_type: GRU model_folder: experiments/nlg_gru/model.py pretrained_model_path: embed_dim: 160 vocab_size: 10000 hidden_dim: 512 OOV_correct: false # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of parameters is needed. # enable_local_dp: true # Local dp clips and adds noise on the client and centrally accumulates the privacy budget # eps: 100 # epsilon # max_grad: 0.008 # max gradient # weight_scaler: 0.0001 # indicates how the aggregation weights scaled before noise addition, and unscaled afterwards. # max_weight: 0.0001 # The max_weight and min_weight should be already scaled by weight_scaler # min_weight: 0.00009 # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. 
# apply_indices_extraction: true # If we extract word indices we want to consider the rank of the words extracted. # allowed_word_rank: 9000 # Any word that rank above this value is considered privacy risk # apply_leakage_metric: true # max_leakage: 30 # max_allowed_leakage: 3 # adaptive_leakage_threshold: 0.95 # Takes the 95th percentile of the leakage for the next round. # is_leakage_weighted: true # attacker_optimizer_config: # lr: 0.03 # type: adamax # amsgrad: false # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedProx # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # Enable/Disable Reinforcement learning resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. optimizer_config: # Configuration for server-side optimizer type: lamb lr: 0.1 weight_decay: 0.005 annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 2 # Frequency for validation rounds rec_freq: 4 # Frequency for testing rounds initial_val : true # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 11 # Total number of rounds for FL num_clients_per_iteration: 10 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false val_data: # Path for validation data vocab_dict: # Path for vocabulary pin_memory: true num_workers: 0 # Indicates how many workers are used for creating batches num_frames: 2400 max_batch_size: 2048 max_num_words: 25 unsorted_batch: true # Note this is NOT the main training data configuration, which is configured in the # client config. This section is ignored unless you are running replay data. 
# If you want to run replay data- set a path name for train_data_server. # train: # batch_size: 128 # loader_type: text # tokenizer_type: not_applicable # prepend_datapath: false # train_data: null # train_data_server: null # vocab_dict: # pin_memory: true # num_workers: 0 # num_frames: 2400 # desired_max_samples: 500 # max_grad_norm: 10.0 # max_batch_size: 128 # max_num_words: 25 # unsorted_batch: true test: # Test data configuration batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false train_data: null train_data_server: null test_data: # Path for validation data vocab_dict: # Path for vocabulary pin_memory: true num_workers: 0 # Indicates how many workers are used for creating batches max_batch_size: 2048 max_num_words: 25 unsorted_batch: true type: model_optimization aggregate_median: softmax # FL aggregation method weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 20.0 initial_lr_client: 1.0 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # server_replay_config: # This is only applies if the server-side training data is fully configured and loaded # server_iterations: 50 # optimizer_config: # type: adam # lr: 0.00002 # amsgrad: true # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: mu: 0.001 # Used only for FedProx aggregation method meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false num_skips_threshold: 10 copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration batch_size: 64 tokenizer_type: not_applicable prepend_datapath: false list_of_train_data: # Path to training data vocab_dict: # Path to vocabulary pin_memory: true num_workers: 0 desired_max_samples: 50000 max_grad_norm: 20.0 max_batch_size: 128 max_num_words: 25 unsorted_batch: true type: optimization meta_optimizer_config: lr: 1.0 type: sgd optimizer_config: type: sgd annealing_config: type: step_lr step_interval: epoch step_size: 1 gamma: 1.0 ================================================ FILE: core/__init__.py ================================================ ================================================ FILE: core/client.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' The Client object is short-lived, instantiated inside workers 1 to N for processing a given client's data. It's main method is the `process_round` function, used to update the model given a client's data. 
''' import copy import logging import os import time from easydict import EasyDict as edict from importlib.machinery import SourceFileLoader import numpy as np import torch # Internal imports import core.federated as federated from .strategies import select_strategy from .trainer import ( Trainer, run_validation_generic, set_component_wise_lr, ) from utils import ( ScheduledSamplingScheduler, make_optimizer, print_rank, to_device, convex_inference, alpha_update, ) from utils.dataloaders_utils import ( make_train_dataloader, make_val_dataloader, make_test_dataloader, get_dataset, ) import extensions.privacy from extensions.privacy import metrics as privacy_metrics from experiments import make_model global train_dataset global trainset_unlab global trainset_unlab_rand class Client: # It's unclear why, but sphinx refuses to generate method docs # if there is no docstring for this class. """Client class for specifying individual client training tasks""" def __init__(self, client_id, config, send_gradients): ''' Client side processing: computing gradients, update the model and send them back to the server Args: client_id (int): identifier for grabbing that client's data. config (dict): dictionary with parameters loaded from config file. send_gradients (bool): if True, model gradients are sent back; otherwise, model weights are sent back. ''' super().__init__() self.client_id = client_id self.config = copy.deepcopy(config) self.send_gradients = send_gradients def get_client_data(self, dataset=None): '''"Getter" method that returns all object's attributes at once.''' client_data = self.get_data(self.client_id, dataset) return self.client_id, client_data, self.config, self.send_gradients @staticmethod def get_train_dataset(data_path, client_train_config, task): '''This function will obtain the dataset for all training users. Args: data_path (str): path to file containing taining data. client_train_config (dict): trainig data config. task (str): task name. 
''' global train_dataset global trainset_unlab global trainset_unlab_rand train_dataset = get_dataset(data_path, client_train_config, task, mode="train") if task == 'semisupervision': trainset_unlab = get_dataset(data_path, client_train_config, task, mode="train", user_idx = -2) trainset_unlab_rand = get_dataset(data_path, client_train_config, task, mode="train", user_idx = -3) else: trainset_unlab = None trainset_unlab_rand = None return len(train_dataset.user_list) @staticmethod def get_data(clients, dataset): ''' Create training dictionary''' if dataset == None: # Training case datasets = [train_dataset, trainset_unlab, trainset_unlab_rand] if trainset_unlab != None else [train_dataset] else: # Evaluation case datasets = [dataset] data_with_labels = hasattr(datasets[0],"user_data_label") strcts = [] # Returning list length will always be 1 except when the task is semisupervision for dataset in datasets: input_strct = {'users': [], 'num_samples': [],'user_data': dict(), 'user_data_label': dict()} if data_with_labels else {'users': [], 'num_samples': [],'user_data': dict()} for client in clients: user = dataset.user_list[client] input_strct['users'].append(user) input_strct['num_samples'].append(dataset.num_samples[client]) input_strct['user_data'][user]= dataset.user_data[user] if data_with_labels: input_strct['user_data_label'][user] = dataset.user_data_label[user] strcts.append(edict(input_strct)) return strcts @staticmethod def run_testvalidate(client_data, server_data, mode, model): '''Called by worker to run test/validation sample on a client. This functions assumes set_model_for_round has already been called to push the model to the client (see federated.py). Args: client_data (tuple): client data and config. It is a tuple with 3 components; importantly, the second component is a dict containing the data, and the third component is a dict with the config parsed from the YAML file. server_data (tuple): server data (model parameters mostly). 
                It is a tuple with 2 components; importantly, the second
                component consists of the current model parameters.
            mode (str): whether to `test` or `validate`.
            model (torch.nn.Module): actual model without parameters.
        '''

        # Process inputs and initialize variables
        _, data_strcts, config, _ = client_data
        _, model_parameters, iteration = server_data
        # Deep-copy so local mutations don't leak back into the shared config.
        config = copy.deepcopy(config)
        model_path = config["model_path"]
        begin = time.time()

        # Use the server's data config since we're distributing test/validate from the server
        data_strct = data_strcts[0]
        data_config = config['server_config']['data_config'][mode]
        want_logits = data_config.get('wantLogits', False)
        send_dicts = config['server_config'].get('send_dicts', False)

        # Create dataloader for the requested evaluation mode
        dataloader = None
        print_rank('making dataloader with task {}'.format(config['server_config']['task']), loglevel=logging.DEBUG)
        if mode == 'test':
            dataloader = make_test_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)
        elif mode == 'val':
            dataloader = make_val_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)

        # Set model parameters received from the server
        n_layers, n_params = len([f for f in model.parameters()]), len(model_parameters)
        print_rank(f'Copying model parameters... {n_layers}/{n_params}', loglevel=logging.DEBUG)
        model = to_device(model)
        if send_dicts:  # Send model state dictionary
            tmp = {}
            for param_key, param_dict in zip(model.state_dict(), model_parameters):
                tmp[param_key] = param_dict
            model.load_state_dict(tmp)
        else:  # Send parameters (copied tensor-by-tensor, onto GPU when available)
            for p, data in zip(model.parameters(), model_parameters):
                p.data = data.detach().clone().cuda() if torch.cuda.is_available() else data.detach().clone()
        print_rank(f'Model setup complete. {time.time() - begin}s elapsed.', loglevel=logging.DEBUG)

        # Compute output and metrics on the test or validation data
        num_instances = sum(data_strct['num_samples'])
        print_rank(f'Validating {num_instances}', loglevel=logging.DEBUG)
        output, metrics = run_validation_generic(model, dataloader)

        # Load local model if necessary (personalization: blend global and
        # per-user local model results)
        if config['server_config']['type']=='personalization':
            local_model = make_model(config['model_config'])
            user = data_strct['users'][0]
            local_model_name = os.path.join(model_path, user + '_model.tar')
            if os.path.exists(local_model_name):
                print_rank('Loading Local Model .. {}'.format(local_model_name))
                checkpoint = torch.load(local_model_name)
                local_model.load_state_dict(checkpoint["model_state_dict"])
            local_alpha_name = os.path.join(model_path, user + '_alpha')
            # NOTE(review): `alpha` is only bound when the alpha file exists;
            # the convex_inference call below would raise NameError otherwise.
            # Presumably process_round always writes the alpha file before a
            # personalization evaluation round — confirm.
            if os.path.exists(local_alpha_name):
                alpha = torch.load(local_alpha_name)
                print_rank('Loading Alpha Weight from {}: Value={}'.format(local_model_name, alpha))

            # Run inference and get logits back (fresh dataloader for the
            # local model pass)
            if mode == 'test':
                dataloader = make_test_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)
            elif mode == 'val':
                dataloader = make_val_dataloader(data_config, data_path=None, task=config['server_config']['task'], data_strct=data_strct)
            output_local, local_metrics = run_validation_generic(local_model, dataloader)
            loss_local = local_metrics['loss']['value']
            # NOTE(review): this `cer` value is immediately overwritten by the
            # convex_inference result below — looks like dead code; confirm.
            cer = local_metrics['acc']['value']

            # Combine logits: convex interpolation of global/local outputs,
            # loss reported as the average of the two passes
            cer = convex_inference(output, output_local, alpha=alpha)
            metrics['loss']['value'] = (metrics['loss']['value'] + loss_local) / 2
            metrics['acc']['value'] = cer

        # Only ship logits back when the data config asked for them
        output = None if not want_logits else output
        return output, metrics, num_instances

    @staticmethod
    def process_round(client_data, server_data, model, data_path, eps=1e-7):
        '''Compute gradients given client's data and update model.

        Args:
            client_data (tuple): client data and config.
                It is a tuple consisting of 4 components: an int indicating
                the client's id, a dict containing that client's data, a dict
                with the config parsed from the YAML file, and a bool
                indicating whether or not gradients should be sent.
            server_data (tuple): server data (model parameters mostly). It is
                a tuple consisting of 2 components; importantly, the first is
                a float giving the client's learning rate, and the second a
                list of torch.Tensor's with current model parameters.
            model (torch.nn.Module): actual model without parameters.
            data_path (str): where to get data from.
            eps (float): lower bound for aggregation weights.
                NOTE(review): `eps` is never referenced in this body —
                possibly a leftover parameter; confirm before removing.
        '''

        # Ensure the client is assigned to the correct GPU (one GPU per worker)
        if torch.cuda.is_available() and torch.cuda.device_count() == federated.size():
            torch.cuda.set_device(federated.local_rank())

        # Process inputs and initialize variables
        client_id, data_strcts, config, send_gradients = client_data
        initial_lr, model_parameters, iteration = server_data
        # Deep-copy so local mutations (e.g. the lr written below) stay local.
        config = copy.deepcopy(config)

        model_config = config['model_config']
        client_config = config['client_config']
        data_config = client_config['data_config']['train']
        semisupervision_config = client_config.get('semisupervision',None)
        task = client_config.get('task', {})
        trainer_config = client_config.get('trainer_config', {})
        privacy_metrics_config = config.get('privacy_metrics_config', None)
        model_path = config["model_path"]

        # Instantiate the client-side federated aggregation strategy
        strategy_algo = config['strategy']
        StrategyClass = select_strategy(strategy_algo)
        strategy = StrategyClass('client', config)
        print_rank(f'Client successfully instantiated strategy {strategy}', loglevel=logging.DEBUG)
        send_dicts = config['server_config'].get('send_dicts', False)

        begin = time.time()
        client_stats = {}

        data_strct = data_strcts[0]
        user = data_strct['users'][0]
        print_rank('Loading : {}-th client with name: {}, {} samples, {}s elapsed'.format(
            client_id[0], user, data_strct['num_samples'][0], time.time() - begin), loglevel=logging.INFO)

        # Get dataloaders
        train_dataloader = make_train_dataloader(data_config, data_path, task=task, clientx=0, data_strct=data_strct)

        # Instantiate the model object (only when the worker didn't receive one)
        if model is None:
            model = make_model(
                model_config,
                dataloader_type=train_dataloader.__class__.__name__,
                input_dim=data_config['input_dim'],
                vocab_size=train_dataloader.vocab_size,
            )

        # Set model parameters received from the server
        n_layers, n_params = len([f for f in model.parameters()]), len(model_parameters)
        print_rank(f'Copying model parameters... {n_layers}/{n_params}', loglevel=logging.DEBUG)
        model = to_device(model)

        if send_dicts:  # Send model state dictionary
            tmp = {}
            for param_key, param_dict in zip(model.state_dict(), model_parameters):
                tmp[param_key] = param_dict
            model.load_state_dict(tmp)
        else:  # Send parameters
            for p, data in zip(model.parameters(), model_parameters):
                p.data = data.detach().clone().cuda() if torch.cuda.is_available() else data.detach().clone()
        print_rank(f'Model setup complete. {time.time() - begin}s elapsed.', loglevel=logging.DEBUG)

        # Fix parameters of layers listed as non-updatable
        if 'updatable_names' in trainer_config:
            set_component_wise_lr(model, client_config['optimizer_config'], trainer_config['updatable_names'])

        # Create the optimizer on the workers
        # NOTE: the server dictates the learning rate for the clients
        client_config['optimizer_config']['lr'] = initial_lr
        optimizer = make_optimizer(client_config['optimizer_config'], model)

        # Make the scheduled sampling scheduler
        ss_scheduler = None
        if 'ss_config' in client_config and client_config['ss_config'] is not None:
            ss_scheduler = ScheduledSamplingScheduler(model=model, **client_config['ss_config'])

        # Make the trainer
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            ss_scheduler=ss_scheduler,
            train_dataloader=train_dataloader,
            server_replay_config=client_config,
            max_grad_norm=client_config['data_config']['train'].get('max_grad_norm', None),
            anneal_config=client_config['annealing_config'] if 'annealing_config' in client_config else None,
            num_skips_threshold=client_config['num_skips_threshold'] if 'num_skips_threshold' in client_config else -1,
            ignore_subtask=client_config['ignore_subtask']
        )

        if trainer.optimizer is not None:
            initial_optimizer_state = copy.deepcopy(trainer.optimizer.state_dict())

        annealing_config = client_config['annealing_config'] if 'annealing_config' in client_config else None

        assert 'desired_max_samples' in client_config['data_config']['train'], 'Missing \'desired_max_samples\' entry in data config parameter'
        desired_max_samples = client_config['data_config']['train']['desired_max_samples']

        if trainer.optimizer is not None:  # reset the optimizer state
            if initial_lr > 0:
                trainer.optimizer.param_groups[0].update({'lr': initial_lr})
            initial_optimizer_state = copy.deepcopy(trainer.optimizer.state_dict())
            trainer.reset_optimizer(initial_optimizer_state, annealing_config)

        # Mark the end of setup
        end = time.time()
        client_stats['setup'] = end - begin
        print_rank(f'Client setup cost {client_stats["setup"]}s', loglevel=logging.DEBUG)
        begin_training = end

        # Training begins here
        trainer.model.train()
        trainer.model.zero_grad()

        # Save the client batches if we want to evaluate the privacy metrics
        apply_privacy_metrics = (False if privacy_metrics_config is None else privacy_metrics_config['apply_metrics'])

        # This is where training actually happens; strategy-specific extra
        # inputs are bundled into algo_payload.
        algo_payload = None
        if strategy_algo == 'FedLabels':
            datasets = [get_dataset(data_path, config, task, mode="train", test_only=False, data_strct=data_strcts[i], user_idx=0) for i in range(3)]
            algo_payload = {'strategy':'FedLabels', 'data': datasets, 'iter': iteration, 'config': semisupervision_config}
        elif strategy_algo == 'FedProx':
            algo_payload = {'strategy':'FedProx', 'mu': client_config.get('mu',0.001)}

        train_loss, num_samples, algo_computation = trainer.train_desired_samples(desired_max_samples=desired_max_samples, apply_privacy_metrics=apply_privacy_metrics, algo_payload=algo_payload)
        print_rank('client={}: training loss={}'.format(client_id[0], train_loss), loglevel=logging.DEBUG)

        # Estimate gradient magnitude mean/var
        # Now computed when the sufficient stats are updated.
        assert 'sum' in trainer.sufficient_stats
        assert 'mean' in trainer.sufficient_stats

        trainer.train_loss = train_loss
        trainer.num_samples = num_samples
        trainer.algo_computation = algo_computation

        # Compute pseudo-gradient: difference between server parameters and
        # locally-updated parameters, stored in .grad for aggregation
        if not send_dicts:
            for p, data in zip(trainer.model.parameters(), model_parameters):
                data = to_device(data)
                p.grad = data - p.data

        payload = strategy.generate_client_payload(trainer) if send_gradients else None

        if config['server_config']['type'] == 'personalization':

            # Initialize convex weight alpha
            alpha = config['client_config'].get('convex_model_interp', 0.75)
            local_model = make_model(config['model_config'])
            train_dataloader = make_train_dataloader(data_config, data_path, task=task, clientx=0, data_strct=data_strct)
            local_optimizer = make_optimizer(client_config['optimizer_config'], local_model)

            # Make the trainer for the per-user local model
            local_trainer = Trainer(
                model=local_model,
                optimizer=local_optimizer,
                ss_scheduler=ss_scheduler,
                train_dataloader=train_dataloader,
                server_replay_config=client_config,
                max_grad_norm=client_config['data_config']['train'].get('max_grad_norm', None),
                anneal_config=client_config['annealing_config'] if 'annealing_config' in client_config else None,
                num_skips_threshold=client_config['num_skips_threshold'] if 'num_skips_threshold' in client_config else -1,
                ignore_subtask=client_config['ignore_subtask']
            )

            # Resume local model/alpha from this user's checkpoint if present
            local_model_name = os.path.join(model_path, user + '_model.tar')
            local_alpha_name = os.path.join(model_path, user + '_alpha')
            if os.path.exists(local_model_name):
                print_rank('Loading Local Model .. {}'.format(local_model_name))
                local_trainer.load(local_model_name, update_lr_scheduler=False, update_ss_scheduler=False)
            if os.path.exists(local_alpha_name):
                print_rank('Loading Alpha Weight .. {}'.format(local_model_name), loglevel=logging.INFO)
                alpha = torch.load(local_alpha_name)

            # Copy original model (pre-training snapshot for pseudo-gradient)
            original_local_model = local_trainer.get_model()

            # Training begins here
            local_trainer.model.train()
            local_trainer.model.zero_grad()

            # Run Local Processing
            # NOTE(review): this call unpacks 2 values while the earlier
            # train_desired_samples call returns 3 — confirm the return arity
            # when algo_payload is None.
            train_loss, num_samples = local_trainer.train_desired_samples(desired_max_samples=desired_max_samples, apply_privacy_metrics=False)
            print_rank('client={}, user:{}: LOCAL training loss={}'.format(client_id[0], user, train_loss), loglevel=logging.INFO)
            local_trainer.save(model_path=model_path, config=config, token=user)

            # Estimate the pseudo-gradient for local model
            for p, orig_param in zip(local_trainer.model.parameters(), original_local_model.parameters()):
                orig_param = orig_param.cuda() if torch.cuda.is_available() else orig_param
                p.grad = orig_param.data - p.data

            # Update and persist the interpolation weight for this user
            alpha = alpha_update(local_trainer.model, trainer.model, alpha, initial_lr)
            torch.save(alpha, local_alpha_name)
            local_trainer.model.zero_grad()

        # Mark that training (including post-processing) is finished
        end = time.time()
        client_stats['training'] = end - begin_training
        client_stats['full cost'] = end - begin
        print_rank(f'Client training cost {end - begin_training}s', loglevel=logging.DEBUG)
        print_rank(f'Client full cost {end - begin}s', loglevel=logging.DEBUG)

        # Create dictionary that is sent back to server (short keys to keep
        # the message small)
        client_output = {
            'cs': client_stats,
            'tl': train_loss,
            'mg': trainer.sufficient_stats['mag'],
            'vg': trainer.sufficient_stats['var'],
            'ng': trainer.sufficient_stats['mean'],
            'rg': trainer.sufficient_stats['norm'],
            'ns': num_samples,
            'pl': payload,
        }

        # Apply privacy metrics; a client may be dropped (weight 'wt' zeroed)
        # when it leaks too much information
        if privacy_metrics_config and privacy_metrics_config['apply_metrics']:
            print_rank('Applying privacy metrics', loglevel=logging.DEBUG)

            privacy_stats = {'Dropped clients': 0}
            batches = trainer.cached_batches
            trainer.cached_batches = []
            gradients = extensions.privacy.unroll_network(model.named_parameters(), select_grad=True)[0]

            if privacy_metrics_config['apply_indices_extraction']:
                allowed_word_rank = privacy_metrics_config.get('allowed_word_rank', 9000)
                embed_dim, vocab_size = model_config['embed_dim'], model_config['vocab_size']
                overlap, indices = privacy_metrics.extract_indices_from_embeddings(gradients, batches, embed_dim, vocab_size)

                max_overlap = privacy_metrics_config.get('max_allowed_overlap', None)
                if max_overlap is not None and overlap > max_overlap:
                    print_rank('Removing this client because we extracted {}% words and the maximum allowed is {}%'.format(overlap * 100, max_overlap * 100))
                    client_output['wt'] = 0.0
                    privacy_stats['Dropped clients'] = 1

                privacy_stats['Extracted indices percentage'] = overlap
                privacy_stats['Words percentage above ' + str(allowed_word_rank) + ' word rank'] = (indices > allowed_word_rank).mean() if len(indices)>0 else 0

            if privacy_metrics_config['apply_leakage_metric']:
                print_rank('Applying leakage metric', loglevel=logging.DEBUG)

                orig_params = {n: p for (n, _), p in zip(trainer.model.named_parameters(), model_parameters)}
                max_ratio = np.exp(privacy_metrics_config['max_leakage'])
                optim_config = privacy_metrics_config['attacker_optimizer_config']
                is_leakage_weighted = privacy_metrics_config['is_leakage_weighted']

                leakage = privacy_metrics.practical_epsilon_leakage(orig_params,
                    trainer.model, batches, is_leakage_weighted, max_ratio, optim_config)
                print_rank('privacy leakage: {}'.format(leakage), loglevel=logging.DEBUG)

                max_leakage = privacy_metrics_config.get('max_allowed_leakage', None)
                if max_leakage is not None and leakage > max_leakage:
                    print_rank('Removing this client because the information leakage/practical epsilon is {} and the maximum allowed is {}'.format(leakage, max_leakage))
                    client_output['wt'] = 0.0
                    privacy_stats['Dropped clients'] = 1

                privacy_stats['Practical epsilon (Max leakage)'] = leakage

            client_output['ps'] = privacy_stats

        client_output['ts'] = time.time()
        return client_output

================================================ FILE:
core/config.py ================================================ # Note this import requires python 3.7+ # Do we want to commit to this? from __future__ import annotations from dataclasses import dataclass from collections.abc import MutableMapping from cerberus import Validator from importlib.machinery import SourceFileLoader from utils.utils import print_rank from importlib.machinery import SourceFileLoader import os # TODO everywhere: choose reasonable defaults. # TODO: decide where task should live as a setting, maybe its own TaskConfig # TODO: docstrings everywhere # TODO: Make ModelConfig a base class that different models inherit from # We could specify the modelconfig class in the config file, # like we do for model.py. The current implementation mixes NLG and BERT # TODO: DatasetConfig needs to be teased apart. # The main issue is we have *_data, list_of_train_data, train_data_server. # They all essentially perform the same function in different contexts. # also some no-longer-used parameters are still present. # TODO: it's not clear what MutableMapping methods need overrides- we # could probably just use the default implementation. 
# TODO: not all pytorch optimizers can handle amsgrad - we should
# have distinct subclasses for the different optimizers


def from_dict(cls, config):
    """ Helper function to convert a dict to a class """
    return cls(**config)


class Config(MutableMapping):
    """Base class for configuration classes.

    Provides dict-like access (get/pop/in/len/iter) on top of plain
    attribute storage, plus dotted-path lookup for nested configs.
    """

    def get(self, k: str, default=None):
        # Unlike dict.get, an attribute explicitly set to None also
        # falls back to `default`.
        result = getattr(self, k, default)
        if result is None:
            return default
        return result

    def lookup(self, s: str, default=None):
        """Dotted-path lookup, e.g. lookup('optimizer_config.type').

        Recurses only through nested Config instances; any non-Config
        intermediate yields `default`.
        """
        toks = s.split('.')
        child = getattr(self, toks[0], default)
        if len(toks) == 1:
            return child if child is not None else default
        elif isinstance(child, Config):
            return child.lookup('.'.join(toks[1:]), default)
        else:
            return default

    def __getitem__(self, k):
        return getattr(self, k)

    def __setitem__(self, k, v):
        setattr(self, k, v)

    def __delitem__(self, k):
        delattr(self, k)

    def __iter__(self):
        return iter(self.__dict__)

    def __len__(self):
        return len(self.__dict__)

    def __contains__(self, k):
        # Mirrors get(): attributes set to None count as absent.
        return getattr(self, k, None) is not None

    def pop(self, k, default=None):
        result = self.get(k, default)
        if k in self:
            delattr(self, k)
        return result


@dataclass
class ModelConfig(Config):
    """Base class for Model configurations

    The model configuration specifies model architecture, parameters,
    and initialization settings.

    Attributes:
        model_type (str): The class name of the model to instantiate. eg GRU.
        model_folder (str): The relative path to the model.py file where
            model_type is defined. eg experiments/nlg_gru/model.py
        pretrained_model_path (str): The path to the pretrained model.
            If None, the model will be randomly initialized using the
            method defined in weight_init.
    """
    model_type: str = None
    model_folder: str = None
    pretrained_model_path: str = None

    @staticmethod
    def from_dict(config) -> ModelConfig:
        """Searches the model folder for config.py and if it is found the model
        config is initialized from the class [model_type]Config.

        NOTE: when no config.py exists, the raw dict is returned unchanged
        (not a ModelConfig) -- callers must tolerate both types.
        """
        cfg_path = os.path.dirname("./" + str(config['model_folder'])) + '/config.py'
        if os.path.exists(cfg_path):
            loader = SourceFileLoader('config', cfg_path).load_module()
            config_class = config['model_type'] + 'Config'
            try:
                config_type = getattr(loader, config_class)
                return from_dict(config_type, config)
            except AttributeError:
                print_rank(f"Config class {config_class} not found in {cfg_path}")
                raise
        else:
            print_rank(f"Warning: couldn't find {cfg_path}, falling back to dictionary.")
            return config


@dataclass
class BERTModelConfig(Config):
    """BERT model configuration

    The BERT configuration specifies huggingface-specific BERT model settings.

    Attributes:
        model_name (str): The name of the BERT model. eg bert-base-uncased.
        cache_dir (str): Tokenizer cache directory, will be created if it
            doesn't exist.
        use_fast_tokenizer (bool): Whether to use the fast tokenizer.
        mask_token (str): special token to use for masking.
        task (str): The task to use for BERT. eg mlm.
        past_index (int): The index of the past state in the BERT model's
            state dict.
        prediction_loss_only (bool): if False, also produce metrics for
            predictions and labels.
        process_line_by_line (bool): if True, process the input line-by-line.

    ToDo:
        * check how cache_dir is used- there's a risk of multiple processes
          reading/writing at the same time.
        * verify the meaning of past_index (thanks copilot)
        * document the difference when process_line_by_line is True vs False
    """
    model_name: str = None
    cache_dir: str = None
    use_fast_tokenizer: bool = False
    mask_token: str = ''
    task: str = 'mlm'
    past_index: int | None = -2
    prediction_loss_only: bool = False
    process_line_by_line: bool = False

    @staticmethod
    def from_dict(config) -> BERTModelConfig:
        return from_dict(BERTModelConfig, config)


@dataclass
class BERTTrainingConfig(Config):
    """BERT training configuration

    Configuration settings for BERT training.

    Attributes:
        seed (int): random seed for reproducibility.
        label_smoothing_factor (float): label smoothing factor. Applied label
            smoothing when the factor is non-zero.
        batch_size (int): batch size.
        max_seq_length (int): maximum input sequence length.
    """
    seed: int | None = None
    label_smoothing_factor: float | None = None
    batch_size: int | None = None
    max_seq_length: int | None = None

    @staticmethod
    def from_dict(config) -> BERTTrainingConfig:
        return from_dict(BERTTrainingConfig, config)


@dataclass
class BERTConfig(Config):
    """BERT configuration

    Specifies the model and training configuration for huggingface
    modeling scenarios.

    Attributes:
        loader_type (str): loader type hint. eg 'text'
        model (BERTModelConfig): BERT model configuration.
        training (BERTTrainingConfig): BERT training configuration.
    """
    loader_type: str = None
    model: BERTModelConfig = None
    training: BERTTrainingConfig = None

    @staticmethod
    def from_dict(config) -> BERTConfig:
        result = BERTConfig()
        for k in config:
            if k == 'model':
                result.model = BERTModelConfig.from_dict(config[k])
            elif k == 'training':
                result.training = BERTTrainingConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class PrivacyConfig(Config):
    """Privacy configuration

    The privacy configuration specified differential privacy settings for
    the model. The user can choose between local or global DP.
    When local DP is enabled, a global epsilon can be computed by applying
    the RDP accountant (see extensions/privacy). The `eps` parameter is used
    to specify the privacy budget for local DP. Conversely, when global DP
    is enabled, `eps` is ignored and `global_sigma` directly specifies the
    global Gaussian noise.

    `max_grad` specifies the clipping parameter for local or global DP,
    `max_weight` specifies the clipping parameter for the local gradient
    aggregation weight (applies to softmax aggregation), and `weight_scaler`
    indicates how the aggregation weight is scaled before noise addition,
    and unscaled afterward. This enables a single eps/sigma parameter for
    both the gradient and its weight.

    Example:
        This example applies local DP with eps=1000. The global epsilon
        will be computing using Renyi DP accounting.

        .. code-block:: yaml

            dp_config:
                # Local dp clips and adds noise on the client and centrally accumulates the privacy budget.
                enable_local_dp: true
                eps: 100 # epsilon
                max_grad: 0.008  # max gradient
                # The max_weight and min_weight should be already scaled by weight_scaler
                # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up.
                max_weight: 0.0001
                weight_scaler: 0.0001
                min_weight: 0.00009

    Attributes:
        enable_local_dp (bool): whether to enable local DP.
        enable_global_dp (bool): whether to enable global DP.
        eps (float): the privacy budget for local DP.
        delta (float): the privacy delta parameter for local DP.
        global_sigma (float): the global Gaussian noise for global DP.
        max_grad (float): the gradient clipping parameter.
        max_weight (float): the aggregation weight clipping parameter.
        weight_scaler (float): the aggregation weight scaling parameter.
        min_weight (float): the minimum per-gradient aggregation weight.
    """
    enable_local_dp: bool = False
    enable_global_dp: bool = False
    eps: float | None = None
    delta: float | None = None
    global_sigma: float | None = None
    max_grad: float | None = None
    max_weight: float | None = None
    weight_scaler: float | None = None
    min_weight: float | None = None

    @staticmethod
    def from_dict(config) -> PrivacyConfig:
        return from_dict(PrivacyConfig, config)


@dataclass
class PrivacyMetricsConfig(Config):
    """Privacy metrics configuration

    This optional feature computes local privacy metrics for computed
    gradients, and optionally filters gradients based on estimated
    privacy loss.

    Attributes:
        apply_metrics (bool): whether to compute privacy metrics.
        apply_indices_extraction (bool): whether to attempt local data
            reconstruction.
        allowed_word_rank (int): threshold for successful reconstruction.
        apply_leakage_metric (bool): whether to compute a privacy leakage
            metric based on the ratio of perplexities before and after
            local training.
        max_leakage (float): the maximum allowed privacy leakage before
            filtering
        adaptive_leakage_threshold (float): if non-zero, compute an adaptive
            leakage threshold based on the previous round of training.
            For example at 0.95, the max_leakage will be adjusted to reject
            5% of gradients, based on the previous round of training.
        is_leakage_weighted (bool): scales the leakage by the maximum
            likelihood of the pre- and post- likelihood tensors. ie the
            worst-case leakage is weighted by the worst-case likelihood
            that we might encounter it.
        attacker_optimizer_config (OptimizerConfig): the optimizer
            configuration for the reconstruction attack.
    """
    apply_metrics: bool = False
    apply_indices_extraction: bool = False
    allowed_word_rank: int | None = None
    apply_leakage_metric: bool = False
    max_leakage: float | None = None
    # NOTE(review): max_allowed_leakage is undocumented above and appears to
    # overlap with max_leakage -- confirm which one downstream code reads.
    max_allowed_leakage: float | None = None
    adaptive_leakage_threshold: float | None = None
    is_leakage_weighted: bool = False
    attacker_optimizer_config: OptimizerConfig = None

    @staticmethod
    def from_dict(config) -> PrivacyMetricsConfig:
        result = PrivacyMetricsConfig()
        for k in config:
            if k == 'attacker_optimizer_config':
                result.attacker_optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class OptimizerConfig(Config):
    """Optimizer configuration

    Pass any pytorch-supported optimizer configuration. The object should
    include a `type` field which indicates the pytorch optimizer type that
    should be invoked. This will be stripped from the object before being
    passed to the Optimizer's init.
    """
    type: str = None
    # Leave this open for any keyword arguments, so we don't break torch constructors
    # In the future we can limit keywords to torch-specific ones.
    # lr: float = 0.0
    # weight_decay: float = 0.0
    # amsgrad: bool = False

    @staticmethod
    def from_dict(config) -> OptimizerConfig:
        # needs its own from_dict so we can accomodate any fields
        result = OptimizerConfig()
        assert 'type' in config
        for k in config:
            setattr(result, k, config[k])
        return result


@dataclass
class AnnealingConfig(Config):
    """Learning rate annealing configuration

    Attributes:
        type (str): the type of annealing. Supported methods:
            :code:`step_lr`, :code:`multi_step_lr`,
            :code:`rampup-keep-expdecay-keep`, :code:`val_loss`.
        step_interval (str): the interval at which to step the learning
            rate. Supported intevals: :code:`epoch`, :code:`batch`.
        gamma (float): the learning rate decay factor.
        step_size (int): the interval between annealing operations.
    """
    type: str = None
    step_interval: str = None
    gamma: float | None = None
    step_size: int | None = None

    @staticmethod
    def from_dict(config) -> AnnealingConfig:
        return from_dict(AnnealingConfig, config)


@dataclass
class DatasetConfig(Config):
    """Dataset configuration shared by all dataloaders (see TODO at top of
    file: this class mixes NLG, MLM, and server/client-specific fields)."""
    # Common to all text (NLG, MLM) dataloaders
    batch_size: int | None = None
    loader_type: str = None
    prepend_datapath: bool = False
    num_workers: int | None = None
    desired_max_samples: int | None = None

    # Common to all client.train dataloaders
    list_of_train_data: str = None
    max_grad_norm: float | None = None  # propose moving max_grad_norm to client config

    # Common to all server.train dataloaders. What is the difference?
    train_data: str = None
    train_data_server: str = None

    # Common to server.test dataloaders
    test_data: str = None

    # Common to server.val dataloaders
    val_data: str = None

    # Specific to NLG dataloaders
    tokenizer_type: str = None  # Note tokenizer_type appears in NLG configs, but always set to 'not applicable'
    vocab_dict: str = None
    pin_memory: bool = False
    num_frames: int | None = None  # num_frames is missing from NLG server.test dataloader
    max_batch_size: int | None = None
    max_num_words: int | None = None
    unsorted_batch: int | None = None
    utterance_mvn: bool = False  # only present on NLG client.train dataloader

    # Specific to MLM dataloaders
    task: str = None
    mlm_probability: float | None = None
    tokenizer_type_fast: bool = False
    max_seq_length: int | None = None
    min_words_per_utt: int | None = None
    max_samples_per_user: int | None = None
    mask_token: str = None
    cache_dir: str = None

    @staticmethod
    def from_dict(config) -> DatasetConfig:
        return from_dict(DatasetConfig, config)


@dataclass
class DataConfig(Config):
    """Data configurations

    Client and server configs may each contain a data config, consisting
    of train, test, and validate datasets. A typical configuration will
    define test and validate in the server data config, while the training
    data is defined in the client config.

    Optionally, the server can have a training config which defines
    server-side training data.

    Attributes:
        train (DatasetConfig): the training dataset configuration.
        val (DatasetConfig): the validation dataset configuration.
        test (DatasetConfig): the test dataset configuration.
    """
    train: DatasetConfig = None
    val: DatasetConfig = None
    test: DatasetConfig = None

    @staticmethod
    def from_dict(config) -> DataConfig:
        train = DatasetConfig.from_dict(config['train']) if 'train' in config else None
        val = DatasetConfig.from_dict(config['val']) if 'val' in config else None
        test = DatasetConfig.from_dict(config['test']) if 'test' in config else None
        return DataConfig(train, val, test)


@dataclass
class ServerReplayConfig(Config):
    """Server replay configuration

    When server-side training data is defined, this config defines how it
    is applied after each client training round.

    Attributes:
        server_iterations (int): the number of iterations to run over
            server-side training data for.
        ignore_subtask (bool): used to determine which model loss to use.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use for the server.
    """
    server_iterations: int
    ignore_subtask: bool
    optimizer_config: OptimizerConfig

    @staticmethod
    def from_dict(config) -> ServerReplayConfig:
        return ServerReplayConfig(
            config['server_iterations'],
            config['ignore_subtask'],
            OptimizerConfig.from_dict(config['optimizer_config'])
        )


@dataclass
class RLConfig(Config):
    """Reinforcement learning configuration

    RL can be applied during dynamic gradient aggregation to speed up
    convergence. This configuration defines the settings for server-side
    RL to train the model for DGA.

    Attributes:
        marginal_update_RL (bool): whether to update the RL model when the
            loss is small.
        RL_path (str): the path to the RL model to train.
        RL_path_global (bool): whether the global training output path
            should be prepended to RL_path.
        model_descriptor_RL (str): string to append to the model filename.
        network_params (list): List of layer widths in the RL network.
            eg: 300,128,128,128,64,100
        initial_epsilon (float): the initial epsilon value for the
            epsilon-greedy policy.
        final_epsilon (float): the final epsilon value for the
            epsilon-greedy policy.
        epsilon_gamma (float): the decay rate for the epsilon-greedy policy.
        max_replay_memory_size (int): the maximum number of samples to
            store in the replay memory.
        minibatch_size (int): the size of the minibatch to use for training.
        gamma (float): the discount factor for the RL model.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use for the RL model.
        annealing_config (AnnealingConfig): the annealing configuration to
            use for the RL model.
    """
    marginal_update_RL: bool = False
    RL_path: str = None
    RL_path_global: bool = False
    model_descriptor_RL: str = None
    network_params: list = None
    initial_epsilon: float | None = None
    final_epsilon: float | None = None
    epsilon_gamma: float | None = None
    max_replay_memory_size: int | None = None
    minibatch_size: int | None = None
    gamma: float | None = None
    optimizer_config: OptimizerConfig = None
    annealing_config: AnnealingConfig = None

    @staticmethod
    def from_dict(config) -> RLConfig:
        result = RLConfig()
        for k in config:
            if k == 'optimizer_config':
                result.optimizer_config = OptimizerConfig.from_dict(config[k])
            elif k == 'annealing_config':
                result.annealing_config = AnnealingConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class ServerConfig(Config):
    """Server configuration

    The server configuration defines the server-side settings.

    Attributes:
        resume_from_checkpoint (bool): whether to resume training from a
            checkpoint.
        max_iteration (int): the maximum number of iterations (federated
            training rounds) to run.
        num_clients_per_iteration (int): the number of clients to use per
            training round.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use server-side.
        annealing_config (AnnealingConfig): the learning rate annealing
            configuration to use server-side.
        val_freq (int): the number of iterations between validation
            evaluation runs.
        rec_freq (int): the number of iterations between test evaluation
            runs.
        initial_val (bool): whether to run validation before initiating
            training.
        initial_rec (bool): whether to run test before initiating training.
        wantRL (bool): whether to train the RL model.
        RL (RLConfig): the RL configuration to use if wantRL is True.
        data_config (DataConfig): the data configuration to use server-side.
        type (str): the type of server. Currently this parameter is ignored
            and OptimizationServer is always used. However there is some
            validation code that checks for one of the following values:

            - model_averaging
            - optimization
            - model_optimization
            - cluster_finetuning
            - cluster_parallel
        aggregate_median (str): the aggregation method to use (DGA softmax,
            or mean). Note that this only applies when the global
            aggregation strategy is DGA.
        weight_train_loss (str): when softmax DGA is enabled, what metric
            to use for weighting. One of

            - train_loss
            - mag_var_loss
            - mag_mean_loss
        softmax_beta (float): the beta value to use for the softmax DGA.
        max_weight (float): the maximum allowed client weight.
        initial_lr_client (float): the initial learning rate for each client.
        lr_delay_factor (float): the client learning rate decay factor.
            NOTE(review): the field name looks like a typo for
            lr_decay_factor, but it is kept for config compatibility.
        best_model_criterion (str): The metric to choose when resetting to
            the best model so far.
        server_replay_config (ServerReplayConfig): the server replay
            configuration to use for any server-side training.
    """
    resume_from_checkpoint: bool = False
    max_iteration: int | None = None
    num_clients_per_iteration: int | None = None
    optimizer_config: OptimizerConfig = None
    annealing_config: AnnealingConfig = None
    val_freq: int | None = None
    rec_freq: int | None = None
    initial_val: bool = True
    initial_rec: bool = True
    wantRL: bool = False
    RL: RLConfig = None
    data_config: DataConfig = None
    type: str = None
    aggregate_median: str = None
    weight_train_loss: str = None
    softmax_beta: float | None = None
    max_weight: float | None = None
    initial_lr_client: float | None = None
    lr_delay_factor: float | None = None
    best_model_criterion: str = 'loss'
    server_replay_config: ServerReplayConfig = None

    @staticmethod
    def from_dict(config) -> ServerConfig:
        result = ServerConfig()
        for k in config:
            if k == 'optimizer_config':
                result.optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            elif k == 'annealing_config':
                result.annealing_config = \
                    AnnealingConfig.from_dict(config[k])
            elif k == 'data_config':
                result.data_config = \
                    DataConfig.from_dict(config[k])
            elif k == 'server_replay_config':
                result.server_replay_config = \
                    ServerReplayConfig.from_dict(config[k])
            elif k == 'RL':
                result.RL = \
                    RLConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class ClientConfig(Config):
    """ Client configuration

    The client configuration defines the client-side settings.

    Attributes:
        meta_learning (str): Set to 'basic'. Currently ignored.
        stats_on_smooth_grad (bool): When true, gradient statistics are
            reset each round. Currently, it appears these statistics
            aren't used.
        ignore_subtask (bool): Used to determine which model loss to use.
            In most cases just set to False.
        num_skips_threshold (int): previously used to skip users,
            deprecated.
        copying_train_data (bool): has no effect.
        do_profiling (bool): whether to enable client-side profiling.
        data_config (DataConfig): the data configuration to use client-side.
        type (str): the type of client. Currently this parameter is ignored?
        meta_optimizer_config (OptimizerConfig): the optimizer configuration
            to use for meta-learning.
        optimizer_config (OptimizerConfig): the optimizer configuration to
            use for client-side training.
        annealing_config (AnnealingConfig): the learning rate annealing
            configuration to use client-side.
    """
    meta_learning: str = None
    stats_on_smooth_grad: bool = False
    ignore_subtask: bool = False
    num_skips_threshold: int | None = None
    copying_train_data: bool = False
    do_profiling: bool = False
    data_config: DataConfig = None
    type: str = None
    meta_optimizer_config: OptimizerConfig = None
    optimizer_config: OptimizerConfig = None
    annealing_config: AnnealingConfig = None

    @staticmethod
    def from_dict(config) -> ClientConfig:
        result = ClientConfig()
        for k in config:
            if k == 'data_config':
                result.data_config = DataConfig.from_dict(config[k])
            elif k == 'meta_optimizer_config':
                result.meta_optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            elif k == 'optimizer_config':
                result.optimizer_config = \
                    OptimizerConfig.from_dict(config[k])
            elif k == 'annealing_config':
                result.annealing_config = \
                    AnnealingConfig.from_dict(config[k])
            else:
                setattr(result, k, config[k])
        return result


@dataclass
class FLUTEConfig(Config):
    """ FLUTEConfig represents the global configuration for a training job.

    Attributes:
        model_config (ModelConfig): the model configuration to use.
        dp_config (PrivacyConfig): differential privacy configuration.
        strategy (str): Aggregation strategy, eg DGA or FedAvg.
        server_config (ServerConfig): the server configuration to use.
        client_config (ClientConfig): the client configuration to use.
    """
    model_config: ModelConfig = None
    dp_config: PrivacyConfig = None
    privacy_metrics_config: PrivacyMetricsConfig = None
    strategy: str = None
    server_config: ServerConfig = None
    client_config: ClientConfig = None

    # NOTE(review): written as an unbound function taking the raw config
    # dict (no `self`/@staticmethod); intended to be called as
    # FLUTEConfig.validate(config_dict). A Config instance also satisfies
    # the mapping interface, so instance calls self-validate.
    def validate(config):
        """Join relative paths in the config with the job's data/output
        paths, and propagate BERT model settings into the data configs.
        Returns the (mutated) config."""

        # Join paths in config file
        if config["server_config"]["wantRL"]:
            rl_path = config["server_config"]["RL"]["RL_path"]
            rl_path = os.path.join(config["output_path"], rl_path) \
                if config["server_config"]["RL"].get("RL_path_global", True) \
                else os.path.join(config["output_path"], config["experiment_name"], rl_path)
            # BUGFIX: the joined path used to be computed and discarded,
            # leaving RL_path relative; write it back so the RL model is
            # stored under the intended output directory.
            config["server_config"]["RL"]["RL_path"] = rl_path

        if "pretrained_model_path" in config["model_config"]:
            config["model_config"]["pretrained_model_path"] = os.path.join(config["data_path"], config["model_config"]["pretrained_model_path"])

        for section in ["server_config", "client_config"]:
            for mode in ['test', 'val', 'train']:
                if mode in config[section]["data_config"] and "vocab_dict" in config[section]["data_config"][mode]:
                    config[section]["data_config"][mode]["vocab_dict"] = os.path.join(config['data_path'], config[section]["data_config"][mode]["vocab_dict"])

                # TODO: Remove BERT specific parameters
                if 'BERT' in config['model_config']:
                    if mode != 'train':
                        config['server_config']['data_config'][mode]['model_name_or_path'] = config['model_config']['BERT']['model']['model_name']
                        config['server_config']['data_config'][mode]['process_line_by_line'] = config['model_config']['BERT']['model']['process_line_by_line']
                    else:
                        config['client_config']['data_config'][mode]['model_name_or_path'] = config['model_config']['BERT']['model']['model_name']
                        config['client_config']['data_config'][mode]['process_line_by_line'] = config['model_config']['BERT']['model']['process_line_by_line']

        return config

    @staticmethod
    def from_dict(config) -> FLUTEConfig:
        # Validate schema in config file.
        # NOTE(review): schema.py is eval'd as a Python literal; this is
        # only safe against trusted, repo-local files.
        with open('./core/schema.py', 'r') as schema_file:
            schema = eval(schema_file.read())
        v = Validator(schema)
        if not v.validate(config, schema):
            raise ValueError('Missing {} argument in config file '.format(v.errors))

        # Normalize default values and report which ones were filled in.
        original_config = config
        config = v.normalized(config)
        for section in ['server_config', 'client_config']:
            for mode in config[section]['data_config'].keys():
                diff = config[section]['data_config'][mode].keys() - original_config[section]['data_config'][mode].keys()
                if len(diff) > 0:
                    print_rank("Assigning default values for: {} in [{}][{}][data_config]".format(diff, section, mode))

        dp_config = \
            PrivacyConfig.from_dict(config['dp_config']) \
            if 'dp_config' in config else None
        priv_metrics_config = \
            PrivacyMetricsConfig.from_dict(config['privacy_metrics_config']) \
            if 'privacy_metrics_config' in config else None
        strategy = config.get('strategy', 'DGA')

        return FLUTEConfig(
            ModelConfig.from_dict(config['model_config']),
            dp_config, priv_metrics_config, strategy,
            ServerConfig.from_dict(config['server_config']),
            ClientConfig.from_dict(config['client_config'])
        )


# ================================================
# FILE: core/dataloader.py
# ================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from torch.utils.data import DataLoader as PyTorchDataLoader
from abc import ABC


class BaseDataLoader(ABC, PyTorchDataLoader):
    '''This is a wrapper class for PyTorch dataloaders.'''

    def create_loader(self):
        '''Returns the dataloader'''
        return self


# ================================================
# FILE: core/dataset.py
# ================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from torch.utils.data import Dataset as PyTorchDataset from abc import ABC, abstractmethod class BaseDataset(ABC, PyTorchDataset): '''This is a wrapper class for PyTorch datasets.''' @abstractmethod def __init__(self,**kwargs): super(BaseDataset, self).__init__() @abstractmethod def __getitem__(self, idx, **kwargs): '''Fetches a data sample for a given key''' pass @abstractmethod def __len__(self): '''Returns the size of the dataset''' pass @abstractmethod def load_data(self,**kwargs): '''Wrapper method to read/instantiate the dataset''' pass ================================================ FILE: core/evaluation.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' In this file we define the functions for running test and validation tasks inside the Server. ''' import logging import torch import numpy as np # Internal imports import core.federated as federated from core.client import Client from utils import print_rank # AzureML-related libs from azureml.core import Run run = Run.get_context() class Evaluation(): def __init__(self, config, model_path, process_testvalidate, idx_val_clients, idx_test_clients, single_worker): self.config = config self.model_path = model_path self.process_testvalidate = process_testvalidate self.server_type = config['server_config']['type'] self.idx_val_clients = idx_val_clients self.idx_test_clients = idx_test_clients self.send_dicts = config['server_config'].get('send_dicts', False) self.single_worker = single_worker super().__init__() def run(self, eval_list, req, metric_logger=None): '''Run test/validation taks depending on the modes received in the eval_list. Args: eval_list (arr): Contains the tasks to run. req (dict): information for test/val tasks metric_logger (callback, optional): callback used for logging. Defaults to None, in which case AML logger is used. 
''' self.worker_trainer = req['worker_trainer'] if self.send_dicts: global_model_values = [self.worker_trainer.model.state_dict()[param_key].to(torch.device('cpu')) for param_key in self.worker_trainer.model.state_dict()] else: global_model_values = [p.data.to(torch.device('cpu')) for p in self.worker_trainer.model.parameters()] if 'tmp_unsup' in req: unsup_values = req['tmp_unsup'].values() sup_values = req['tmp_sup'].values() semisupervision_inference = True else: semisupervision_inference = False save_model = False if metric_logger is None: metric_logger = run.log for mode in eval_list: # Skipping validation round when RL is enabled if 'wantRL' in self.config['server_config'] and self.config['server_config']['wantRL'] and mode == "val": continue # Compute avg_loss and avg_acc self.metrics = self.run_distributed_inference(mode, global_model_values) req = self.initialize_req(req) if len(req) == 1 else req # Only if for semisupervision if semisupervision_inference: unsup_metrics = self.run_distributed_inference(mode, unsup_values) sup_metrics = self.run_distributed_inference(mode, sup_values) for key, value in unsup_metrics.items(): metric_logger(str("Unsup" +mode + " " + key).capitalize(), value['value']) print_rank('LOG UNSUP: {}_{}={}'.format(mode, key, value['value'])) for key, value in sup_metrics.items(): metric_logger(str("Sup" + mode + " " + key).capitalize(), value['value']) print_rank('LOG SUP: {}_{}={}'.format(mode, key, value['value'])) # Log metrics for key, value in self.metrics.items(): metric_logger(str(mode + " " + key).capitalize(), value['value']) print_rank('LOG: {}_{}={}: best_{}_{}={}'.format(mode, key, value['value'], mode, key, req[str("best_"+ mode + "_" + key)])) for key,value in self.metrics.items(): attr = str("best_"+ mode + "_" + key) if value['higher_is_better']: if self.metrics[key]['value'] > req[attr]: req[attr] = self.metrics[key]['value'] save_model = True else: if self.metrics[key]['value'] < req[attr]: req[attr] = 
self.metrics[key]['value'] save_model = True if save_model and mode == 'val': self.worker_trainer.save( model_path=self.model_path, token=str('best_'+ mode +'_'+key), config=self.config['server_config'] ) save_model = False return req def initialize_req(self, req): '''Update the keys, to have the same as metrics dictionary. This function is only used during itr=0 for initializing the req dictionary. Args: req (dict): Best results for all the metrics (e.g. best_val_acc). ''' for mode in ['test','val']: for key in self.metrics.keys(): attr = "best_"+ mode + "_" + key req[attr] = -1.0 if self.metrics[key]['higher_is_better'] else float('inf') return req def run_distributed_inference(self, mode, model): '''Call `run_distributed_evaluation` specifically for test or validation. This is just a helper function that fetches the clients depending on the mode and calls `run_distributed_evaluation` using that list. Args: mode (str): `test` or `val`. ''' if mode == 'val': clients = self.idx_val_clients elif mode == 'test': clients = self.idx_test_clients else: raise NotImplementedError('Unsupported mode: {}'.format(mode)) return self.run_distributed_evaluation(mode, clients, model) def run_distributed_evaluation(self, mode, clients, model): '''Perform evaluation using available workers. See also `process_test_validate` on federated.py. Args: mode (str): `test` or `val`. clients (list): clients for test/val round. 
''' total = 0 self.logits = {'predictions': [], 'probabilities': [], 'labels': []} server_data = (0.0, model, 0) for result in self.process_testvalidate(clients, server_data, mode, self.single_worker): output, metrics, count = result val_metrics = {key: {'value':0, 'higher_is_better': False} for key in metrics.keys()} if total == 0 else val_metrics for key in val_metrics: val_metrics[key]['value'] += metrics[key]['value']* count val_metrics[key]['higher_is_better'] = metrics[key]['higher_is_better'] total+= count if output is not None: self.logits['predictions'].append(output['predictions']) self.logits['probabilities'].append(output['probabilities']) self.logits['labels'].append(output['labels']) if self.logits['probabilities'] and self.logits['predictions'] and self.logits['labels']: self.logits['predictions'] = np.concatenate(self.logits['predictions']) self.logits['probabilities'] = np.concatenate(self.logits['probabilities']) self.logits['labels'] = np.concatenate(self.logits['labels']) for key in val_metrics: val_metrics[key]['value'] = val_metrics[key]['value']/total self.losses = [val_metrics['loss']['value'], val_metrics['acc']['value']] # For compatibility with Server return val_metrics def make_eval_clients(dataset, config): '''Generator that yields clients for evaluation, continuously. 
Args: dataset (torch.utils.data.Dataset): used to get client's data config (dict): used for the client's constructor ''' total = sum(dataset.num_samples) clients = federated.size() - 1 if federated.size()>1 else federated.size() delta = total / clients + 1 threshold = delta current_users_idxs = list() current_total = 0 if config["server_config"]["type"] == "personalization": for i in range(len(dataset.user_list)): yield Client([i], config, False) else: for i in range(len(dataset.user_list)): current_users_idxs.append(i) count = dataset.num_samples[i] current_total += count if current_total > threshold: print_rank(f'sending {len(current_users_idxs)} users', loglevel=logging.DEBUG) yield Client(current_users_idxs, config, False) current_users_idxs = list() current_total = 0 if len(current_users_idxs) != 0: print_rank(f'sending {len(current_users_idxs)} users -- residual', loglevel=logging.DEBUG) yield Client(current_users_idxs, config, False) ================================================ FILE: core/federated.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import cProfile import logging import threading import torch import torch.distributed as dist import numpy as np from core.client import Client from utils import ( print_rank, print_profiler, to_device, ) COMMAND_UPDATE = 0 COMMAND_TRAIN = 1 COMMAND_TERMINATE = 10 COMMAND_TESTVAL = 11 COMMAND_SYNC_NODES = 9 GLOBAL_MESSAGE = None def encode_string(word, string_to_int = True): """ Encodes/Decodes the dictionary keys into an array of integers to be sent as tensors of the same shape during NCCL/Gloo P2P communication. Args: word (string/array): key to be encoded/decoded. string_to_int (bool): flag that indicates which action to perform. 
""" if string_to_int: # encode word = word.ljust(8, ' ') if len(word) < 8 else word # padding -- 8 is max length, all tensors must have the same size during communication word_encoded = [letter for letter in word.encode()] return word_encoded else: #decode cleanup_array = [letter for letter in word if letter!= 32] # Remove padding word_decoded = bytes(cleanup_array).decode() return word_decoded def rank(): """ Return rank of node. """ return int(os.environ['RANK']) def local_rank(): """ Return local rank of node. """ return int(os.environ['LOCAL_RANK']) def size(): """ Returns number of nodes in the distributed group, including server. """ return int(os.environ['WORLD_SIZE']) def _recv(x, src=0): """ Receives tensors with a single element or a list of tensors with the same shape during distributed communication. """ x = torch.tensor(x) if torch.is_tensor(x) == False else x x = to_device(x) dist.recv(tensor=x, src=src) x.to('cpu') try: return x.item() # single element except: return x.tolist() # list of tensors def _recv_gradients(src): """ Receives a list of tensors with different shape during distributed communication. """ n, n_dimensions, grads = 0, 0, [] # tensors intialization -- required by torch. n = _recv(n,src) for i in range(n): n_dimensions = _recv(n_dimensions,src) dimensions = [0 for i in range(n_dimensions)] dimensions = _recv(dimensions, src) print_rank(f"Received dimensions {dimensions}", loglevel=logging.DEBUG) param = to_device(torch.zeros(dimensions)) print_rank(f"Shape assigned {param.shape}", loglevel=logging.DEBUG) dist.recv(param,src) grads.append(param.detach().cpu()) torch.cuda.empty_cache() return grads def _send(x, dst=0): """ Send tensors with a single element or a list of tensors with the same shape during distributed communication. 
""" x = torch.tensor(x) x = to_device(x) dist.send(x, dst) del x torch.cuda.empty_cache() def _send_metrics(output): """ Organize the keys and values from the resulting dictionary from test/val rounds into arrays that are sent as independent tensors during distributed communication. """ keys = [encode_string(key) for key in output.keys()] values = [float(output[key]['value']) for key in output.keys()] higher_is_better = [int(output[key]['higher_is_better']) for key in output.keys()] # send the boolean as int _send(len(keys),0) _send(keys) _send(values) _send(higher_is_better) def _send_gradients(gradients, dst): """ Send a list of tensors with different shape during distributed communication. """ _send(len(gradients), dst) for i in gradients: dimensions = [int(d) for d in i.shape] _send(len(dimensions),dst) _send(dimensions,dst) param = to_device(i) dist.send(param,dst) del param torch.cuda.empty_cache() def _send_train_output(output): """ Organize the keys and values from the the returning ´client_output´ dictionary in ´Client.proces_round()´ function during training rounds, into arrays that are sent as independent tensors during distributed communication. 
""" cs_values = [float(cs_v) for cs_v in output['cs'].values()] # cs dict -- values are flatten in 1d array pl_values = [float(output['pl']['weight'])] # pl dict gradients = output['pl']['gradients'] # gradients are sent independently if len(output.keys()) > 9: # DP metrics ps_values = [float(ps_v) for ps_v in output['ps'].values()] values = cs_values + [float(output[key]) for key in output.keys() if key not in ['cs','pl','ps']] + pl_values + ps_values # reorganizing values in the order expected by the Server else: values = cs_values + [float(output[key]) for key in output.keys() if key not in ['cs','pl']] + pl_values # reorganizing values in the order expected by the Server # Send data _send(int(len(output.keys())),0) # Warn for number of keys _send(values, 0) _send_gradients(gradients, 0) def build_grads_dict(node): """ Reconstruct the dictionary ´client_output´ returned by ´Client.proces_round()´ function on the Server side during distributed communication. """ # Initialize tensors n_keys = 0 n_keys = _recv(n_keys,node) print(n_keys) if n_keys == 9: keys = ['cs','tl','mg','vg','ng','rg','ns','ts','pl'] values = [0.0 for i in range(11)] # initializing tensor shape -- 11 is fixed number of keys expected elif n_keys == 10: keys = ['cs','tl','mg','vg','ng','rg','ns','ts','pl','ps'] values = [0.0 for i in range(15)] # When the privacy metrics are enabled elif n_keys == 11: keys = ['cs','tl','mg','vg','ng','rg','ns','wt','ts','pl','ps'] values = [0.0 for i in range(16)] # When the privacy metrics are enabled # Read data values = _recv(values,node) grads = _recv_gradients(node) cs_values = [{key: values.pop(0) for key in ['setup','training','full cost']}] # recreating cs dict # Rebuilding original dictionary if n_keys == 9: pl_values = [{'weight':values.pop(), 'gradients': grads}] # recreating pl dict values_list = cs_values + [values.pop(0) for i in range(7)] + pl_values # 7 is fixed length for remaining items else: ps_values = [{key: values.pop() for key in 
['Practical epsilon (Max leakage)','Words percentage above 9000 word rank','Extracted indices percentage','Dropped clients']}] pl_values = [{'weight':values.pop(), 'gradients': grads}] # recreating pl dict values_list = cs_values + [values.pop(0) for i in range(len(values))] + pl_values + ps_values result = dict(zip(keys,values_list)) # Cast values to original type for key in ['mg','vg','ng','rg']: result[key] = np.float32(result[key]) result['ns'] = int(result['ns'] ) return result def build_metrics_dict(node): """ Reconstruct the dictionary returned during test/val rounds on the Server side during distributed communication. """ # Initialize tensors n = 0 n = _recv(n,node) keys = [[0 for j in range(8)] for i in range(n)] # max_seq_len for metric name is 8 values = [0.0 for i in range(n)] higher_is_better = [0 for i in range(n)] # Read data keys = _recv(keys,node) values = _recv(values,node) higher_is_better = _recv(higher_is_better,node) # Reorganize output + decode dict keys orig_keys = [encode_string(key, string_to_int=False) for key in keys] values_dict = [{'value': float(v), 'higher_is_better': bool(higher_is_better[i])} for i, v in enumerate(values)] metrics = dict(zip(orig_keys,values_dict)) num_instances = int(metrics.pop('num')['value']) result = None, metrics, num_instances return result def receive_workers_output(node_request_map, results_list, free_nodes, command, idle_nodes): """ Receives the clients output on the Server side in async/sync mode. 
Asynchronous mode is only enabled when using NCCL backend given that Gloo does not provide native non-blocking implementation to check if the operation has been completed during distributed training""" if dist.get_backend() == "nccl": # Async for node, req in node_request_map: if req.is_completed(): result = build_metrics_dict(node) if command == COMMAND_TESTVAL else build_grads_dict(node) results_list.append(result) free_nodes.append(node) node_request_map.remove((node,req)) print_rank(f"Finished releasing the nodes {free_nodes}", loglevel=logging.DEBUG) else: # Sync print_rank(f"Waiting for a workers", loglevel=logging.DEBUG) gather_objects = [(None,None,None) for i in range(size())] output = [None for _ in gather_objects] dist.all_gather_object(output, gather_objects[rank()]) print_rank(f" All workers have finished ... taking the remaining clients {len(output)}", loglevel=logging.DEBUG) output = [e for i,e in enumerate(output) if i not in idle_nodes ] # Cleanup for idle workers results_list = results_list + output[1:] free_nodes = list(range(1, size())) return node_request_map, results_list, free_nodes def append_async_requests(node_request_map, node): """ Appends the asynchronous request sent to each worker during asynchronous training. """ ack = to_device(torch.tensor(1)) req = dist.irecv(tensor=ack, src=node) node_request_map.append((node,req)) return node_request_map def sync_idle_nodes(client_queue, free_nodes): """ Request dummy outputs to the odd (idle) nodes during synchronous training to prevent them to get trapped in the state of the previous iterations """ idle_nodes = [] if len(client_queue) == 0: print_rank(f"Free idle nodes {len(free_nodes)}", loglevel=logging.DEBUG) while len(free_nodes) > 0: node = free_nodes.pop() idle_nodes.append(node) _send(COMMAND_SYNC_NODES, node) return idle_nodes class Server: """Server object responsible for orchestration and aggregation. 
The Server is one of the two objects that may exist inside of a thread, all throughout its execution (the other being the Worker). At every round, the Server samples clients ids and send their data for an available Worker to process. The Workers then each produce a new model, and all models are sent to the Server for aggregation. The methods defined here are related to orchestration only, the aggregation will be done by a different object which inherits from this one. Notes: This class has no :code`__init__` method, and all its methods are static. It thus only serves the purpose of grouping the methods, but nothing is actually stored inside of the object. """ @staticmethod def dispatch_clients(clients, server_data, command, mode=None, do_profiling=False, single_worker=None): """Perform the orchestration between Clients and Workers. This function does the following: 1. It sends the server_data to all workers 2. For each available Worker: 2a. It sends the index of the client to instantiate 2c. It triggers the execution of the command on the Client. 3. Collect and return all client outputs. Notes: This function yields the gradients of different clients as they are received. Therefore, the order of the results generally does not correspond to the order of the clients. All commands used during Server-Worker communication must be float/integers given that torch.distributed only allows to send/recv tensors. Args: clients (list): list of clients to be processed. server_data (dict): server data sent to the workers and passed to clients, typically includes the global model at that step. command (int): instruction for worker to execute on the Client. mode (int): test/val only provided during evaluation rounds. do_profiling (bool): enables profiler during comunication. Returns: Generator of results. 
""" # Single GPU flag single_gpu = True if size()==1 else False print_rank(f"Single GPU flag Server: {single_gpu}", loglevel=logging.DEBUG) # Some cleanup torch.cuda.empty_cache() torch.cuda.synchronize() if torch.cuda.is_available() else None # Initialize communication profiler profiler = None if do_profiling: profiler = cProfile.Profile() profiler.enable() # Update lr + model parameters each round for all workers lr, model_params, nround = server_data if not single_gpu: for worker_rank in range(1, size()): _send(COMMAND_UPDATE, worker_rank) _send(lr,worker_rank) _send_gradients(model_params, worker_rank) _send(float(nround),worker_rank) print_rank(f"Finished sending lr {lr} and n_params {len(model_params)} to worker {worker_rank} - round {nround}", loglevel=logging.DEBUG) print_rank(f"Finished sending server_data to workers", loglevel=logging.DEBUG) client_queue = clients.copy() print_rank(f"Clients queue: {client_queue}", loglevel=logging.DEBUG) free_nodes = list(range(1, size())) results_list, node_request_map = [], [] # Initiate computation for all clients while client_queue: print_rank(f"Clients queue: {client_queue}", loglevel=logging.DEBUG) assert len(free_nodes) > 0 node = free_nodes.pop() index = len(client_queue)-1 client_to_process = client_queue.pop(index) print_rank(f"Sending client {index} to worker {node}", loglevel=logging.DEBUG) _send(command, node) # The command should indicate the worker which function to run on the client if command == COMMAND_TESTVAL: _send(mode,node) # Only for test/val has a value _send(index, node) # Worker receives the index of the client to pop elif command == COMMAND_TRAIN: _send(client_to_process, node) print_rank(f"Finished assigning worker {node}, free nodes {free_nodes}", loglevel=logging.DEBUG) if dist.get_backend() == "nccl": append_async_requests(node_request_map, node) idle_nodes = None else: idle_nodes = sync_idle_nodes(client_queue, free_nodes) # Waits until receive the output from all ranks if not free_nodes: 
print_rank(f"Waiting for a workers, free nodes {free_nodes}, reqs_lst {node_request_map}", loglevel=logging.DEBUG) while len(free_nodes) == 0: node_request_map, results_list, free_nodes = receive_workers_output(node_request_map, results_list, free_nodes, command, idle_nodes) for output in results_list: yield output results_list = [] # Wait for all workers to finish while (len(node_request_map)) != 0: node_request_map, results_list, free_nodes = receive_workers_output(node_request_map, results_list, free_nodes, command, idle_nodes) for output in results_list: yield output results_list = [] else: # For a single-GPU execution, there is no P2P communication in the same GPU. Using threats to coordinate. global GLOBAL_MESSAGE GLOBAL_MESSAGE = server_data if command == COMMAND_TESTVAL: t1 = threading.Thread(target=single_worker.trigger_evaluate) t1.start() t1.join() yield GLOBAL_MESSAGE elif command == COMMAND_TRAIN: total_clients = clients.copy() for client_id in total_clients: GLOBAL_MESSAGE = lr, model_params, nround, client_id t1 = threading.Thread(target=single_worker.trigger_train) t1.start() t1.join() result = GLOBAL_MESSAGE yield result if do_profiling: profiler.disable() print_profiler(profiler) # Some cleanup torch.cuda.empty_cache() torch.cuda.synchronize() if torch.cuda.is_available() else None @staticmethod def process_clients(clients, server_data, single_worker): """Ask workers to perform training on Clients. Args: clients (list): list of clients indexes sampled by ´Server.py´ object per iteration. server_data (dict): dictionary containing model. Returns: Generator of results. """ return Server.dispatch_clients(clients, server_data, COMMAND_TRAIN, single_worker=single_worker) @staticmethod def process_testvalidate(clients, server_data, mode, single_worker): """Ask workers to perform test/val on Clients. Args: clients (list): list of clients indexes for test/val rounds. server_data (dict): dictionary containing model. mode (str): test/val. 
        Returns:
            Generator of results.
        """

        # Encode the mode as a one-element int list: -2 == test, 2 == val.
        mode = [-2] if mode == "test" else [2]
        return Server.dispatch_clients(clients, server_data, COMMAND_TESTVAL, mode, single_worker=single_worker)

    @staticmethod
    def terminate_workers(terminate=True):
        """Terminate the execution of the workers."""

        if terminate:
            print_rank("Terminating worker processes")
            for worker_rank in range(1, size()):
                _send(COMMAND_TERMINATE, worker_rank)

class Worker:
    """Worker object responsible for instantiate Clients based on incoming
    data from the Server and perform train/eval functions on it.

    Each worker lives on a different NCCL/Gloo thread and is assigned to a
    different GPU. Via the :code:`dispatch_clients` function, the Server
    passes to the Worker specific instructions to process clients' data,
    typically in order to generate a new model or to compute metrics.

    Attributes:
        model (torch.nn.Module): model being trained.
        data_path (str): path where all clients' data is located.
        do_profiling (bool): if True, analyzes execution in depth.
        val_clients (list): clients list for validation rounds.
        test_clients (list): clients list for testing rounds.
        config (dict): clients configuration.
        val_dataset (torch.utils.data.Dataset): validation dataset.
        test_dataset (torch.utils.data.Dataset): testing dataset.
    """

    def __init__(self, model=None, data_path=None, do_profiling=False, val_clients= None, \
            test_clients=None, config=None, val_dataset = None, test_dataset = None):

        self.model = model
        self.data_path = data_path
        self.do_profiling = do_profiling
        self.config = config
        self.val_clients = val_clients
        self.test_clients = test_clients
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset

    def run(self):
        """Main loop executed by worker nodes.

        This method handles the NCCL/Gloo communication between the worker
        and the server. It keeps listening for commands from the Server, and
        performs different actions on the Client assigned depending on the
        command received.
        """

        # Single GPU flag -- with a single node the Server drives this Worker
        # directly via trigger_train/trigger_evaluate, so the loop is skipped.
        single_gpu = True if size()==1 else False
        print_rank(f"Single GPU flag Client: {single_gpu}", loglevel=logging.DEBUG)

        if not single_gpu:
            while True: # keeps listening for incoming server calls

                # Initialize tensors -- required by torch.distributed
                command, client_idx, mode = 0, 0, 0 # int
                lr, nround = torch.zeros(1), torch.zeros(1) # float

                # Read command
                command = _recv(command)
                print_rank(f"Command received {command} on worker {rank()}", loglevel=logging.DEBUG)

                # Receive server data -- lr, model_params
                if command == COMMAND_UPDATE:
                    # Round state pushed by the Server at the start of every
                    # round; cached in server_data for the commands below.
                    print_rank(f"COMMMAND_UPDATE received {rank()}", loglevel=logging.DEBUG)
                    lr = _recv(lr, 0)
                    model_params = _recv_gradients(0)
                    nround = _recv(nround, 0)
                    server_data = (lr, model_params, int(nround))
                    print_rank(f"Received lr: {lr} and n_params: {len(model_params)} - round {nround}", loglevel=logging.DEBUG)

                elif command == COMMAND_TRAIN:
                    print_rank(f"COMMMAND_TRAIN received {rank()}", loglevel=logging.DEBUG)

                    # Init profiler in training worker
                    profiler = None
                    if self.do_profiling:
                        profiler = cProfile.Profile()
                        profiler.enable()

                    # Receive client id from Server
                    client_idx = _recv(client_idx)
                    print_rank(f"Cliend idx received from Server: {client_idx}", loglevel=logging.DEBUG)

                    # Instantiate client
                    client_to_process = Client(
                        [client_idx],
                        self.config,
                        self.config['client_config']['type'] == 'optimization')

                    # Execute Client.get_data()
                    client_data = client_to_process.get_client_data()

                    # Execute Client.process_round()
                    output = client_to_process.process_round(client_data, server_data, self.model, self.data_path)

                    # Send output back to Server
                    if dist.get_backend() == "nccl": # ASYNC mode -- enabled only for nccl backend
                        # Ack first so the Server's pending irecv completes,
                        # then stream the training output.
                        ack = to_device(torch.tensor(1))
                        dist.isend(tensor=ack, dst=0)
                        _send_train_output(output)
                    else: # SYNC mode -- gloo backend does not have a non-blocking way to check if the operation is completed
                        gather_objects = [output for i in range(size())]
                        output = [None for _ in gather_objects]
                        dist.all_gather_object(output,
                            gather_objects[rank()])

                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None

                    if self.do_profiling:
                        profiler.disable()
                        print_profiler(profiler)

                elif command == COMMAND_TESTVAL:
                    print_rank(f"COMMMAND_TESTVAL received {rank()}", loglevel=logging.DEBUG)

                    # Init profiler in validation worker
                    profiler = None
                    if self.do_profiling:
                        profiler = cProfile.Profile()
                        profiler.enable()

                    # Receive mode and client id from Server
                    mode = _recv(mode)
                    # The Server encodes the mode as -2 == test, 2 == val.
                    mode = "test" if mode == -2 else "val"
                    client_idx = _recv(client_idx)
                    print_rank(f"Client idx received from Server: {client_idx}, {mode}", loglevel=logging.DEBUG)

                    # Get client and dataset
                    clients = self.val_clients if mode == "val" else self.test_clients
                    dataset = self.val_dataset if mode == "val" else self.test_dataset
                    clients_queue = clients.copy()
                    assert 0 <= client_idx < len(clients_queue)
                    client_to_process = clients_queue.pop(client_idx)

                    # Execute Client.get_data()
                    client_data = client_to_process.get_client_data(dataset)

                    # Execute Client.run_testvalidate()
                    output = client_to_process.run_testvalidate(client_data, server_data, mode, self.model)

                    # Send output back to Server
                    if dist.get_backend() == "nccl": # ASYNC mode -- enabled only for nccl backend
                        # Piggyback the sample count as a pseudo-metric 'num';
                        # the Server pops it back out in build_metrics_dict.
                        _, metrics, num_instances = output
                        metrics['num']= {'value': float(num_instances), 'higher_is_better': False}
                        output = metrics
                        print_rank(f"Worker {rank()} output {output}", loglevel=logging.DEBUG)
                        ack = to_device(torch.tensor(1))
                        dist.isend(tensor=ack, dst=0)
                        _send_metrics(output)
                    else: # SYNC mode -- gloo backend does not have a non-blocking way to check if the operation is completed
                        gather_objects = [output for i in range(size())]
                        output = [None for _ in gather_objects]
                        dist.all_gather_object(output, gather_objects[rank()])
                        print_rank(f"Worker {rank()} sent output back", loglevel=logging.DEBUG)

                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None

                    if self.do_profiling:
                        profiler.disable()
                        print_profiler(profiler)

                elif command == COMMAND_TERMINATE:
                    print_rank(f"COMMMAND_TERMINATE received {rank()}", loglevel=logging.DEBUG)
                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None
                    return

                elif command == COMMAND_SYNC_NODES: # Only for sync calls
                    # Idle worker: contribute a dummy entry to the collective
                    # gather so the other ranks can make progress.
                    print_rank(f"COMMMAND_SYNC_NODES received {rank()}", loglevel=logging.DEBUG)
                    gather_objects = [None for i in range(size())]
                    output = [None for _ in gather_objects]
                    dist.all_gather_object(output, gather_objects[rank()])
                    print_rank(f"Worker IDLE {rank()} sent dummy output back", loglevel=logging.DEBUG)
                    # Some cleanup
                    torch.cuda.empty_cache()
                    torch.cuda.synchronize() if torch.cuda.is_available() else None

                else:
                    assert False, "unknown command"

    def trigger_evaluate(self):
        """Single-GPU path: run validation for the round described in
        GLOBAL_MESSAGE and write the metrics back into GLOBAL_MESSAGE."""
        global GLOBAL_MESSAGE
        lr, model_params, nround = GLOBAL_MESSAGE
        server_data = (lr, model_params, int(nround))
        mode = "val"

        # Get client and dataset
        clients = self.val_clients if mode == "val" else self.test_clients
        dataset = self.val_dataset if mode == "val" else self.test_dataset
        clients_queue = clients.copy()
        client_to_process = clients_queue.pop()

        # Execute Client.get_data()
        client_data = client_to_process.get_client_data(dataset)

        # Execute Client.run_testvalidate()
        output = client_to_process.run_testvalidate(client_data, server_data, mode, self.model)
        _, metrics, num_instances = output
        metrics['num']= {'value': float(num_instances), 'higher_is_better': False}
        GLOBAL_MESSAGE = (_, metrics, num_instances)

        # Some cleanup
        torch.cuda.empty_cache()
        torch.cuda.synchronize() if torch.cuda.is_available() else None

    def trigger_train(self):
        """Single-GPU path: train the client id carried in GLOBAL_MESSAGE and
        write the training output back into GLOBAL_MESSAGE."""
        global GLOBAL_MESSAGE
        lr, model_params, nround, client_idx = GLOBAL_MESSAGE
        server_data = (lr, model_params, int(nround))

        # Instantiate client
        client_to_process = Client([client_idx],
            self.config,
            self.config['client_config']['type'] == 'optimization')

        # Execute Client.get_data()
        client_data = client_to_process.get_client_data()

        # Execute Client.process_round()
        GLOBAL_MESSAGE = client_to_process.process_round(client_data, server_data, self.model, self.data_path)

        # Some cleanup
        torch.cuda.empty_cache()
        torch.cuda.synchronize() if torch.cuda.is_available() else None


================================================
FILE: core/metrics.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
In this file we define the wrapper class for implementing metrics.
'''

import logging
import numpy as np
import torch

from utils import print_rank

class Metrics():

    def __init__(self):
        super().__init__()

    def compute_metrics(self,dataloader, model):
        '''This method is called by ´run_validation_generic´ function
        inside trainer.py .

        This is just a helper function that computes the metrics
        returned in the inference function inside ´model.py´.
        '''
        print_rank("Computing metrics")
        return self.call_inference(dataloader,model)

    def call_inference(self, dataloader, model):
        '''Run the model over ´dataloader´ without gradients and aggregate
        the per-batch metrics returned by ´model.inference´ into
        sample-weighted averages.

        Returns:
            tuple: (output_tot, metrics) where output_tot concatenates the
            per-batch probabilities/predictions/labels and metrics maps each
            metric name to {'value', 'higher_is_better'}.

        NOTE(review): an empty dataloader leaves ´inf_results´ undefined and
        ´counter´ at 0, so this raises (NameError / ZeroDivisionError); the
        final metric keys are taken from the LAST batch's inference dict --
        assumes every batch reports the same metric set. TODO confirm.
        '''

        metrics, sum_metrics = dict(), dict()
        output_tot = {"probabilities": [], "predictions": [], "labels":[]}
        counter = 0

        with torch.no_grad():
            for _, batch in enumerate(dataloader):
                val_loss = model.loss(batch).item()
                inf_results = model.inference(batch)
                # Loss is folded in as just another metric (lower is better).
                inf_results ['loss'] = {'value': val_loss,'higher_is_better': False}
                output = inf_results.pop('output')
                batch_size = inf_results.pop('batch_size')

                # Normalize bare values into {'value', 'higher_is_better'} dicts
                # and make sure each metric has an accumulator.
                for key in inf_results.keys():
                    if not isinstance(inf_results[key], dict):
                        inf_results[key] = {'value':inf_results[key],'higher_is_better': True}
                    sum_metrics[key] = [] if not key in sum_metrics else sum_metrics[key]

                if isinstance(output, dict):
                    output_tot["probabilities"].append(output["probabilities"])
                    output_tot["predictions"].append(output["predictions"])
                    output_tot["labels"].append(output["labels"])

                # Accumulate sample-weighted metric values for averaging below.
                for q in inf_results.keys():
                    sum_metrics[q].append(inf_results[q]['value']* batch_size)
                counter += batch_size
                torch.cuda.empty_cache()

        output_tot["probabilities"] = np.concatenate(output_tot["probabilities"]) if output_tot["probabilities"] else []
        output_tot["predictions"] = np.concatenate(output_tot["predictions"]) if output_tot["predictions"] else []
        output_tot["labels"] = np.concatenate(output_tot["labels"]) if output_tot["labels"] else []

        # Post-processing of metrics
        print_rank(f"validation complete {counter}", loglevel=logging.DEBUG)
        model.set_train()

        # Sample-weighted average over all evaluated examples.
        for k in inf_results.keys():
            metrics[k] = inf_results[k]
            metrics[k]['value'] = sum(sum_metrics[k])/counter

        print_rank(f"validation examples {counter}", loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        return output_tot, metrics

================================================
FILE: core/model.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch as T
from abc import ABC, abstractmethod

class BaseModel(ABC, T.nn.Module):
    '''This is a wrapper class for PyTorch models.'''

    @abstractmethod
    def __init__(self,**kwargs):
        super(BaseModel, self).__init__()

    @abstractmethod
    def loss(self, input):
        '''Performs forward step and computes the loss

        Returns:
            torch: Computed loss.
        '''
        pass

    @abstractmethod
    def inference(self, input):
        '''Performs forward step and computes metrics

        Returns:
            dict: The metrics to be computed. The following keys are
            the minimum required by FLUTE during evaluations rounds:

                - output
                - acc
                - batch_size

            More metrics can be computed by adding the key with a
            dictionary that includes the fields ´value´ and
            ´higher_is_better´ as follows:

            {'output':output, 'acc': accuracy, 'batch_size': n_samples,
            'f1_score': {'value':f1,'higher_is_better': True}}
        '''
        pass

    def set_eval(self):
        '''Bring the model into evaluation mode'''
        self.eval()

    def set_train(self):
        '''Bring the model into training mode'''
        self.train()

================================================
FILE: core/schema.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# ''' # In this file we define the schema for the configuration # files that will be pass it to an instance of the Validator # in e2e_trainer.py # ''' { 'model_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'model_type': {'required': True, 'type':'string'}, 'model_folder': {'required': True, 'type':'string'}, 'BERT':{ 'required':False, 'type': 'dict', 'allow_unknown': True, 'schema':{ 'loader_type': {'required': False, 'type': 'string'}, 'model': { 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'model_name_or_path': {'required': False, 'type':'string'}, 'model_name': {'required': True, 'type':'string'}, 'process_line_by_line': {'required': True, 'type':'boolean'}, } } } }, } }, 'dp_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'enable_local_dp': {'required': True, 'type':'boolean'}, 'enable_global_dp': {'required': False, 'type':'boolean'}, 'eps': {'required': False, 'type':'float'}, 'delta': {'required': False, 'type':'float'}, 'global_sigma': {'required': False, 'type':'float'}, 'max_grad': {'required': False, 'type':'float'}, 'max_weight': {'required': False, 'type':'float'}, 'weight_scaler': {'required': False, 'type':'float'}, 'min_weight': {'required': False, 'type':'float'}, } }, 'privacy_metrics_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'apply_metrics': {'required': True, 'type':'boolean'}, 'apply_indices_extraction': {'required': False, 'type':'boolean'}, 'allowed_word_rank': {'required': False, 'type':'integer'}, 'apply_leakage_metric': {'required': False, 'type':'boolean'}, 'max_leakage': {'required': False, 'type':'float'}, 'adaptive_leakage_threshold': {'required': False, 'type':'float'}, 'is_leakage_weighted': {'required': False, 'type':'boolean'}, 'attacker_optimizer_config': {'required': False, 'type':'dict', 'allow_unknown': True}, } }, 'strategy':{ 'required': True, 'type': 'string' }, 'server_config':{ 'required': True, 
'type': 'dict', 'allow_unknown': True, 'schema': { 'wantRL': {'required': True, 'type':'boolean', 'allow_unknown': True}, 'RL': {'required': False, 'type':'dict'}, 'resume_from_checkpoint': {'required': True, 'type':'boolean'}, 'do_profiling': {'required': True, 'type':'boolean'}, 'optimizer_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': True, 'type':'float'}, 'weight_decay': {'required': False, 'type':'float'}, } }, 'annealing_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string'}, 'step_interval': {'required': True, 'type':'string'}, 'gamma': {'required': True, 'type':'float'}, 'step_size': {'required': True, 'type':'integer'}, } }, 'val_freq': {'required': False, 'type':'integer', 'default': 1}, 'rec_freq': {'required': False, 'type':'integer', 'default': 8}, 'initial_val': {'required': False, 'type':'boolean', 'default': True}, 'initial_rec': {'required': False, 'type':'boolean', 'default': False}, 'max_iteration': {'required': False, 'type':'integer', 'default': 10000}, 'num_clients_per_iteration': {'required': False, 'type':'integer', 'default': 1}, 'data_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'keysrules':{'forbidden':['num_clients']}, 'schema': { 'val': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'val_data': {'required': True, 'type':'string', 'nullable':True}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': 
False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, 'cache_dir': {'required': False, 'type':'string'}, }, }, 'test': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'test_data': {'required': True, 'type':'string', 'nullable': True}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, 'cache_dir': {'required': False, 'type':'string'}, }, }, 'train': { 'required': False, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'train_data_server': {'required': False, 'type':'string'}, 'desired_max_samples': {'required': False, 'type':'integer'}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': 
False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, 'cache_dir': {'required': False, 'type':'string'}, } }, } }, 'type': { 'required': False, 'type':'string', 'allowed':['model_optimization', 'personalization'], 'default': 'model_optimization' }, 'aggregate_median': {'required': False, 'type':'string'}, 'initial_lr_client': {'required': True, 'type':'float'}, 'lr_decay_factor': {'required': True, 'type':'float'}, 'weight_train_loss': {'required': True, 'type':'string'}, 'best_model_criterion': {'required': False, 'type':'string', 'default':'loss'}, 'fall_back_to_best_model': {'required': False, 'type':'boolean', 'default': False}, 'softmax_beta': {'required': True, 'type':'float'}, 'server_replay_config': { 'required': False, 'type':'dict', 'schema':{ 'server_iterations': {'required': True, 'type':'integer'}, 'optimizer_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': True, 'type':'float'}, 'weight_decay': {'required': False, 'type':'float'}, 'amsgrad': {'required': False, 'type':'boolean'}, } }, } }, 'nbest_task_scheduler': { 'required': False, 'type':'dict', 'schema':{ 'num_tasks': {'required': True, 'type':'integer'}, 'iteration_per_task': {'required': True, 'type':'integer'}, } }, } }, 'client_config':{ 'required': True, 'type': 'dict', 'allow_unknown': True, 'schema': { 'meta_learning': {'required': False, 'type':'string'}, 'stats_on_smooth_grad': {'required': False, 'type':'boolean'}, 'ignore_subtask': {'required': True, 'type':'boolean'}, 'num_skips_threshold': {'required': False, 'type':'integer'}, 'copying_train_data': {'required': False, 'type':'boolean'}, 'do_profiling': {'required': True, 
'type':'boolean'}, 'data_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'keysrules':{'forbidden':['num_clients']}, 'schema': { 'train': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'batch_size': {'required': False, 'type':'integer', 'default': 40}, 'list_of_train_data': {'required': True, 'type':'string', 'nullable': True}, 'tokenizer_type': {'required': False, 'type':'string'}, 'prepend_datapath': {'required': False, 'type':'boolean', 'default': False}, 'vocab_dict': {'required': False, 'type':'string'}, 'pin_memory': {'required': False, 'type':'boolean', 'default': True}, 'num_workers': {'required': False, 'type':'integer', 'default': 1}, 'num_frames': {'required': False, 'type':'integer', 'default': 0}, 'max_batch_size': {'required': False, 'type':'integer', 'default': 0}, 'max_num_words': {'required': False, 'type':'integer'}, 'max_grad_norm': {'required': False, 'type':'float', 'default': 5.0 }, 'unsorted_batch': {'required': False, 'type':'boolean', 'default': False}, } }, } }, 'type': { 'required': False, 'type':'string', 'allowed':['optimization', 'gradient_computation'], 'default': 'gradient_computation', }, 'meta_optimizer_config': { 'required': False, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': True, 'type':'float'}, } }, 'optimizer_config': { 'required': True, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string', 'allowed':['sgd', 'adam','adamax', 'lars', 'LarsSGD', 'lamb', 'adamW']}, 'lr': {'required': False, 'type':'float'}, 'weight_decay': {'required': False, 'type':'float'}, } }, 'annealing_config': { 'required': False, 'type':'dict', 'allow_unknown': True, 'schema': { 'type': {'required': True, 'type':'string'}, 'step_interval': {'required': True, 'type':'string'}, 'gamma': {'required': False, 'type':'float'}, 'step_size': 
{'required': False, 'type':'integer'}, } }, 'ss_config': {'required': False, 'type':'dict', 'allow_unknown': True}, } }, } ================================================ FILE: core/server.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' In this file, we define the classes that live inside 'worker 0', the worker responsible for orchestration and aggregation. The main class is the OptimizationServer, which sends clients to the other workers to process and combines the resulting models. ''' import json import logging import os import random import shutil import time from collections import defaultdict import numpy as np import torch # Internal imports import core.federated as federated from core.evaluation import Evaluation from core.client import Client from .strategies import select_strategy from .trainer import ( ModelUpdater, Trainer, set_component_wise_lr, ) from utils import ( get_lr, print_rank, update_json_log, to_device, ) # For profiling import cProfile import pstats # AzureML-related libs from azureml.core import Run run = Run.get_context() class OptimizationServer(federated.Server): def __init__(self, num_clients, model, optimizer, ss_scheduler, data_path, model_path, server_train_dataloader, config, idx_val_clients, idx_test_clients, single_worker): '''Implement Server's orchestration and aggregation. This is the main Server class, that actually implements orchestration and aggregation, inheriting from `federated.Server`, which deals with communication only. The `train` method is central in FLUTE, as it defines good part of what happens during training. Args: num_clients (int): total available clients. model (torch.nn.Module): neural network model. optimizer (torch.optim.Optimizer): optimizer. ss_scheduler: scheduled sampling scheduler. data_path (str): points to where data is. model_path (str): points to where pretrained model is. 
        server_train_dataloader (torch.utils.data.DataLoader): dataloader for training
        config (dict): JSON style configuration parameters
        idx_val_clients (list): validation client ids
        idx_test_clients (list): testing clients ids
        '''

        super().__init__()

        # Initialize all attributes from arguments
        self.client_idx_list = list(range(num_clients))

        self.config = config
        server_config = config['server_config']
        decoder_config = config.get('decoder_config', None)

        self.max_iteration = server_config['max_iteration']
        self.do_clustering = server_config.get('clustering', False)

        # If enabled, full state dicts are shipped to clients instead of bare parameter tensors
        self.send_dicts = server_config.get('send_dicts', False)

        # Accepts either a single int or a comma-separated "min,max" string; the
        # two-element form triggers a random client count per round in `train`
        self.num_clients_per_iteration = [int(x) for x in server_config['num_clients_per_iteration'].split(',')] \
            if isinstance(server_config['num_clients_per_iteration'], str) \
            else [server_config['num_clients_per_iteration']]

        self.val_freq = server_config['val_freq']
        # NOTE(review): attribute is `req_freq` but the config key is 'rec_freq' --
        # naming mismatch carried over from the original; confirm before renaming.
        self.req_freq = server_config['rec_freq']

        self.evaluation = Evaluation(config, model_path, self.process_testvalidate, idx_val_clients, idx_test_clients, single_worker)

        # TODO: does this need to be adjusted for custom metrics?
        self.metrics = dict()

        self.model_backup_freq = server_config.get('model_backup_freq', 100)
        self.worker_trainer_config = server_config.get('trainer_config', {})

        self.aggregate_median = server_config['aggregate_median']
        self.initial_lr_client = server_config.get('initial_lr_client', -1.0)
        self.lr_decay_factor = server_config.get('lr_decay_factor', 1.0)

        self.model_type = config['model_config']['model_type']
        self.quant_thresh = config['client_config'].get('quant_thresh', None)
        self.quant_bits = config['client_config'].get('quant_bits', 10)

        self.list_of_train_data = config['client_config']['data_config']['train']['list_of_train_data']
        self.data_path = data_path
        self.single_worker = single_worker

        # Get max grad norm from data config
        if 'train' in server_config['data_config']:
            max_grad_norm = server_config['data_config']['train'].get('max_grad_norm', None)
        else:
            max_grad_norm = None

        # Creating an instance to update the model with stats aggregated from workers
        self.worker_trainer = ModelUpdater(
            model=model,
            optimizer=optimizer,
            ss_scheduler=ss_scheduler,
            train_dataloader=server_train_dataloader,
            val_dataloader=None,
            max_grad_norm=max_grad_norm,
            anneal_config=server_config['annealing_config'],
            model_type=self.model_type,
            decoder_config=decoder_config
        )
        self.metrics['worker_trainer'] = self.worker_trainer

        # Creating an instance for the server-side trainer (runs mini-batch SGD)
        self.server_replay_iterations = None
        self.server_trainer = None
        if server_train_dataloader is not None:
            assert 'server_replay_config' in server_config, 'server_replay_config is not set'
            assert 'optimizer_config' in server_config[
                'server_replay_config'], 'server-side replay training optimizer is not set'
            self.server_optimizer_config = server_config['server_replay_config']['optimizer_config']
            self.server_trainer_config = server_config['server_replay_config'].get('trainer_config', {})
            self.server_replay_iterations = server_config['server_replay_config']['server_iterations']
            self.server_trainer = Trainer(
                model=model,
                optimizer=None,
                ss_scheduler=ss_scheduler,
                train_dataloader=server_train_dataloader,
                server_replay_config=server_config['server_replay_config'],
                # max_grad_norm falls back from the replay config to the server data config
                max_grad_norm=server_config['server_replay_config']\
                    .get('max_grad_norm',server_config['data_config']['train']\
                        .get('max_grad_norm',None)),
                anneal_config=server_config['server_replay_config'].get('annealing_config', None),
                ignore_subtask = server_config['server_replay_config'].get('ignore_subtask', False)
            )

        self.skip_model_update = False  # will not update the model if True

        self.train_loss = 0.0
        self.model_path = model_path
        self.best_model_criterion = server_config['best_model_criterion']
        self.fall_back_to_best_model = server_config['fall_back_to_best_model']
        self.last_model_path = os.path.join(self.model_path, 'latest_model.tar')
        self.best_model_path = os.path.join(self.model_path,
            'best_val_{}_model.tar'.format(self.best_model_criterion))
        self.log_path = os.path.join(self.model_path, 'status_log.json')
        self.cur_iter_no = 0  # keep the iteration number for Tensor board plotting
        self.lr_weight = 1.0

        self.losses = []
        self.no_label_updates = 0  # no. label updates

        # Update the parameters above from the status log when resuming from a checkpoint
        if server_config.get('resume_from_checkpoint', False):
            self.load_saved_status()

        # Decoding config
        self.decoder_config = decoder_config
        self.spm_model = server_config['data_config']['test'].get('spm_model', None)

        self.do_profiling = server_config.get('do_profiling', False)

        StrategyClass = select_strategy(config['strategy'])
        self.strategy = StrategyClass('server', self.config, self.model_path)
        print_rank(f'Server successfully instantiated strategy {self.strategy}', loglevel=logging.DEBUG)

    def load_saved_status(self):
        '''Load checkpoint from disk.

        Restores both the model weights (onto `worker_trainer`) and the
        bookkeeping values (iteration number, best metrics, LR weight)
        from the status log, when either exists on disk.
        '''
        # Check if model is on disk, if so loads it onto trainer
        if os.path.exists(self.last_model_path):
            print_rank('Resuming from checkpoint model {}'.format(self.last_model_path))
            self.worker_trainer.load(self.last_model_path, update_lr_scheduler=True, update_ss_scheduler=True)
            if self.server_trainer is not None:
                self.server_trainer.model = self.worker_trainer.model  # make sure that the models are in sync

        # Check if log is on disk, if so loads it onto current stats
        if os.path.exists(self.log_path):
            with open(self.log_path, 'r') as logfp:  # loading the iteration no., best loss and CER
                elems = json.load(logfp)
                self.cur_iter_no = elems.get('i', 0)
                self.metrics['best_val_loss'] = elems.get('best_val_loss', float('inf'))
                self.metrics['best_val_acc'] = elems.get('best_val_acc', 0)
                self.metrics['best_test_loss'] = elems.get('best_test_loss', float('inf'))
                self.metrics['best_test_acc'] = elems.get('best_test_acc', 0)
                self.lr_weight = elems.get('weight', 1.0)
                self.no_label_updates = elems.get('num_label_updates', 0)
            print_rank(f'Resuming from status_log: cur_iter: {self.cur_iter_no}')
    def run(self):
        '''Trigger training.

        This is a simple wrapper to the `train` method.
        '''
        print_rank('server started')
        self.train()
        print_rank('server terminated')

    def train(self):
        '''Main method for training.

        Runs the federated loop: broadcast model to sampled clients, collect
        and aggregate their payloads via the strategy, optionally replay on the
        server, evaluate, checkpoint and log -- for `max_iteration` rounds.
        Workers are terminated in the `finally` clause even on failure.
        '''
        self.run_stats = {
            'secsPerClientRound': [],
            'secsPerClient': [],
            'secsPerClientTraining': [],
            'secsPerClientSetup': [],
            'secsPerClientFull': [],
            'secsPerRoundHousekeeping': [],
            'secsPerRoundTotal': [],
            'communicationCosts': []
        }

        run.log('Max iterations', self.max_iteration)

        try:
            self.worker_trainer.model = to_device(self.worker_trainer.model)

            # Do an initial validation round to understand the pretrained model's validation accuracy
            # Skip if we resumed from a checkpoint (cur_iter_no > 0)
            eval_list = []
            if self.cur_iter_no == 0:
                if self.config['server_config']['initial_rec']:
                    eval_list.append('test')
                if self.config['server_config']['initial_val']:
                    eval_list.append('val')
                run.log('LR for agg. opt.', get_lr(self.worker_trainer.optimizer))

            print_rank("Running {} at itr={}".format(eval_list, self.cur_iter_no))
            self.metrics = self.evaluation.run(eval_list, self.metrics, metric_logger=run.log)
            eval_list = []  # some cleanup

            # Dump all the information in aggregate_metric
            print_rank('Saving Model Before Starting Training', loglevel=logging.INFO)
            for token in ['best_val_loss', 'best_val_acc', 'best_test_acc', 'latest']:
                self.worker_trainer.save(
                    model_path=self.model_path,
                    token=token,
                    config=self.config['server_config']
                )

            # Training loop
            self.worker_trainer.model.train()
            for i in range(self.cur_iter_no, self.max_iteration):
                begin = time.time()
                metrics_payload = {}

                # Metrics are buffered per round and flushed to AML at the end of the iteration
                def log_metric(k, v):
                    metrics_payload[k] = v

                print_rank('==== iteration {}'.format(i))
                log_metric('Current iteration', i)

                # Initial value for the learning rate of the worker
                initial_lr = self.initial_lr_client * self.lr_weight
                print_rank('Client learning rate {}'.format(initial_lr))

                # Run training on clients
                self.worker_trainer.model.zero_grad()
                self.train_loss = []

                if self.send_dicts:
                    # Send state dictionaries (CPU copies, so they can be pickled to workers)
                    glob_payload = [self.worker_trainer.model.state_dict()[param_key].to(torch.device('cpu')) for param_key in self.worker_trainer.model.state_dict()]
                else:
                    # Send parameters
                    glob_payload = [p.data.to(torch.device('cpu')) for p in self.worker_trainer.model.parameters()]
                server_data = (initial_lr, glob_payload, i)

                # Random number of clients per iteration
                if len(self.num_clients_per_iteration) > 1:
                    num_clients_curr_iter = random.randint(
                        self.num_clients_per_iteration[0],
                        self.num_clients_per_iteration[1]
                    )
                else:
                    num_clients_curr_iter = self.num_clients_per_iteration[0]
                log_metric('Clients for round', num_clients_curr_iter)

                # Perform annealing in quantization threshold
                if self.quant_thresh is not None:
                    self.config['client_config']['quant_thresh'] *= self.config['client_config'].get('quant_anneal', 1.0)
                    self.quant_thresh = self.config['client_config']['quant_thresh']
                    log_metric('Quantization Thresh.', self.config['client_config']['quant_thresh'])

                # Create the pool of clients -- sample from this pool to assign to workers
                sampled_idx_clients = random.sample(self.client_idx_list,
                    num_clients_curr_iter) if num_clients_curr_iter > 0 else self.client_idx_list

                # Initialize stats
                clients_begin = time.time()

                client_losses = []
                client_mag_grads = []
                client_mean_grads = []
                client_var_grads = []
                client_norm_grads = []

                self.run_stats['secsPerClient'].append([])
                self.run_stats['secsPerClientFull'].append([])
                self.run_stats['secsPerClientTraining'].append([])
                self.run_stats['secsPerClientSetup'].append([])
                self.run_stats['communicationCosts'].append([])

                # Check if we want privacy metrics
                apply_privacy_metrics = self.config.get('privacy_metrics_config', None) and \
                    self.config['privacy_metrics_config']['apply_metrics']
                adaptive_leakage = apply_privacy_metrics and \
                    self.config['privacy_metrics_config'].get('adaptive_leakage_threshold', None)
                if apply_privacy_metrics:
                    privacy_metrics_stats = defaultdict(list)

                # Initialize profiler
                profiler = None
                if self.do_profiling:
                    profiler = cProfile.Profile()
                    profiler.enable()

                # Reset gradient for the model before assigning the new gradients
                self.worker_trainer.model.zero_grad()

                print_rank(f"Clients sampled from server {sampled_idx_clients}", loglevel=logging.DEBUG)
                for client_output in self.process_clients(sampled_idx_clients, server_data, self.single_worker):
                    # Process client output; keys are the compact wire format produced by the clients
                    client_timestamp = client_output['ts']
                    client_stats = client_output['cs']
                    client_loss = client_output['tl']
                    client_mag_grad = client_output['mg']
                    client_mean_grad = client_output['ng']
                    client_var_grad = client_output['vg']
                    client_norm_grad = client_output['rg']
                    client_payload = client_output['pl']

                    if apply_privacy_metrics:
                        privacy_stats = client_output['ps']
                        for metric, value in privacy_stats.items():
                            privacy_metrics_stats[metric].append(value)

                    self.run_stats['communicationCosts'][-1].append(time.time() - client_timestamp)

                    # Get actual pseudo-gradients for aggregation
                    payload_processed = self.strategy.process_individual_payload(self.worker_trainer, client_payload)
                    if not payload_processed:
                        print_rank('Dropping client', loglevel=logging.DEBUG)
                        num_clients_curr_iter -= 1
                        continue

                    # Aggregate stats
                    self.train_loss.append(client_loss)
                    client_losses.append(client_loss)
                    client_mag_grads.append(client_mag_grad.item())
                    client_mean_grads.append(client_mean_grad.item())
                    client_var_grads.append(client_var_grad.item())
                    client_norm_grads.append(client_norm_grad.item())

                    # Mark the end of client processing
                    client_end = time.time()

                    self.run_stats['secsPerClientFull'][-1].append(client_stats['full cost'])
                    self.run_stats['secsPerClientTraining'][-1].append(client_stats['training'])
                    self.run_stats['secsPerClientSetup'][-1].append(client_stats['setup'])
                    self.run_stats['secsPerClient'][-1].append(client_end - clients_begin)

                # Tear down profiler
                if self.do_profiling:
                    profiler.disable()
                    stats = pstats.Stats(profiler)
                    stats.sort_stats('cumulative').print_stats()

                # Prepare output
                client_mag_grads = np.array(client_mag_grads)
                client_mean_grads = np.array(client_mean_grads)
                client_var_grads = np.array(client_var_grads)
                client_norm_grads = np.array(client_norm_grads)

                client_stats = (client_mag_grads, client_mean_grads, client_var_grads)

                dump_norm_stats = self.config.get('dump_norm_stats', False)
                if dump_norm_stats:
                    with open(os.path.join(self.model_path, 'norm_stats.txt'), 'a', encoding='utf-8') as outF:
                        outF.write('{}\n'.format(json.dumps(list(client_norm_grads))))

                # Print the privacy metrics
                if apply_privacy_metrics:
                    for metric, values in privacy_metrics_stats.items():
                        if metric == 'Dropped clients':
                            log_metric(metric, sum(values))
                        else:
                            log_metric(metric, max(values))

                    if type(adaptive_leakage) is float:
                        values = privacy_metrics_stats['Practical epsilon (Max leakage)']
                        new_threshold = list(sorted(values))[int(adaptive_leakage*len(values))]
                        print_rank('Updating leakage threshold to {}'.format(new_threshold))
                        self.config['privacy_metrics_config']['max_allowed_leakage'] = new_threshold

                # Mark that all clients have been processed
                end = time.time()
                self.run_stats['secsPerClientRound'].append(end - begin)
                begin = end

                # Log the training loss to tensorboard/AML
                log_metric('Training loss', sum(self.train_loss))

                # Combine payloads
                self.losses = self.strategy.combine_payloads(
                    worker_trainer=self.worker_trainer,
                    curr_iter=i,
                    num_clients_curr_iter=num_clients_curr_iter,
                    total_clients = len(self.client_idx_list),
                    client_stats=client_stats,
                    logger=log_metric,
                )

                # Run a couple of iterations of training data on the server
                if self.server_trainer is not None:
                    print_rank('Running replay iterations on server')

                    if 'updatable_names' in self.server_trainer_config:
                        set_component_wise_lr(
                            self.worker_trainer.model,
                            self.server_optimizer_config,
                            self.server_trainer_config['updatable_names']
                        )
                    self.server_trainer.prepare_iteration(self.worker_trainer.model)
                    self.server_trainer.train_desired_samples(self.server_replay_iterations)
                    self.worker_trainer.model.load_state_dict(self.server_trainer.model.state_dict())
                    torch.cuda.empty_cache()

                # Update a sampling scheduler
                print_rank('Run ss scheduler')
                self.worker_trainer.run_ss_scheduler()

                # Run inference and score on val/test depending on the iter. number
                if ((i+1) % self.val_freq) == 0:
                    eval_list.append("val")
                if ((i+1) % self.req_freq) == 0 :
                    eval_list.append("test")

                if len(eval_list)> 0:
                    print_rank('Running {} at itr={}'.format(eval_list,i+1))
                    self.metrics['worker_trainer'] = self.worker_trainer
                    if hasattr(self.strategy,'tmp_unsup'):
                        self.metrics['tmp_sup'] = self.strategy.tmp_sup
                        self.metrics['tmp_unsup'] = self.strategy.tmp_unsup
                    self.metrics = self.evaluation.run(eval_list, self.metrics, metric_logger=run.log)
                    self.losses = self.evaluation.losses
                    eval_list = []

                # Create a schedule for the initial_lr (for the worker)
                # NOTE(review): `eval_list` was reset to [] just above, so this
                # condition can never be True as written and the LR-decay branch
                # looks unreachable -- confirm intended placement of the reset.
                if 'val' in eval_list:
                    run.log('LR for agg. opt.', get_lr(self.worker_trainer.optimizer))
                    if not (self.losses[0] < self.metrics['best_val_loss']):
                        self.lr_weight *= self.lr_decay_factor
                        print_rank('LOG: Client weight of learning rate {}..'.format(self.lr_weight))

                # Backup the current best models
                self.backup_models(i)

                # Fall back to the best model if the option is enabled
                self.fall_back_to_prev_best_status()

                # Logging the latest best values only after the 1st val/test round has been executed
                if len(self.metrics) > 1:
                    update_json_log(
                        self.log_path,
                        {
                            'i': i + 1,
                            'best_val_loss': float(self.metrics['best_val_loss']),
                            'best_val_acc': float(self.metrics['best_val_acc']),
                            'best_test_loss': float(self.metrics['best_test_loss']),
                            'best_test_acc': float(self.metrics['best_test_acc']),
                            'weight': float(self.lr_weight),
                            'num_label_updates': int(self.no_label_updates)
                        },
                    )

                end = time.time()

                # Aggregate stats
                self.run_stats['secsPerRoundHousekeeping'].append(end - begin)
                self.run_stats['secsPerRoundTotal'].append(self.run_stats['secsPerClientRound'][-1] + \
                    self.run_stats['secsPerRoundHousekeeping'][-1])

                log_metric('secsPerRoundTotal', self.run_stats['secsPerRoundTotal'][-1])
                if self.do_profiling:
                    log_metric('secsPerClientRound', self.run_stats['secsPerClientRound'][-1])
                    log_metric('secsPerRoundHousekeeping', self.run_stats['secsPerRoundHousekeeping'][-1])

                metrics_for_stats = [
                    'secsPerClient',
                    'secsPerClientTraining',
                    'secsPerClientFull',
                    'secsPerClientSetup',
                    'communicationCosts',
                ]

                for metric in metrics_for_stats:
                    log_metric(f'{metric}Mean', np.mean(self.run_stats[metric][-1]))
                    log_metric(f'{metric}Median', np.median(self.run_stats[metric][-1]))
                    log_metric(f'{metric}Max', max(self.run_stats[metric][-1]))

                for k in self.run_stats:
                    if k in metrics_for_stats:
                        print_rank('{}: {}'.format(k, max(self.run_stats[k][-1])), loglevel=logging.DEBUG)
                    else:
                        print_rank('{}: {}'.format(k, self.run_stats[k][-1]), loglevel=logging.DEBUG)

                # Log all the metrics
                for k in metrics_payload:
                    run.log(k, metrics_payload[k])

        finally:  # perform cleanup even if error was raised above
            self.terminate_workers(terminate=(not self.do_clustering))
''' # Always save the latest model self.worker_trainer.save( model_path=self.model_path, token='latest', config=self.config['server_config'], ) if (i % self.model_backup_freq) == 0: # save the current best models self.worker_trainer.save( model_path=self.model_path, token='epoch{}'.format(i), config=self.config['server_config'] ) for bodyname in ['best_val_acc', 'best_val_loss', 'best_test_acc']: src_model_path = os.path.join(self.model_path, '{}_model.tar'.format(bodyname)) if os.path.exists(src_model_path): dst_model_path = os.path.join(self.model_path, 'epoch{}_{}_model.tar'.format(i, bodyname)) shutil.copyfile(src_model_path, dst_model_path) print_rank('Saved {}'.format(dst_model_path)) def fall_back_to_prev_best_status(self): '''Go back to the past best status and switch to the recent best model.''' if self.fall_back_to_best_model: print_rank('falling back to model {}'.format(self.best_model_path)) # Save current learning rate tmp_lr = get_lr(self.worker_trainer.optimizer) # Load previous best model self.worker_trainer.load(self.best_model_path, update_lr_scheduler=False, update_ss_scheduler=False) # Update previous learning rate on optimizer for g in self.worker_trainer.optimizer.param_groups: g['lr'] = tmp_lr if self.server_trainer is not None: self.server_trainer.model = self.worker_trainer.model # make sure that the models are in sync def select_server(server_type): '''Select a server type using different possible strings. Right now this just returns `OptimizationServer`, but this function could be useful when there are multiple choices of server. Args: server_type (str): indicates server choice. config (dict): config parsed from YAML, passed so that parameters can be used to select a given server. 
''' if server_type == "personalization": from experiments.cv.server import PersonalizationServer return PersonalizationServer else: return OptimizationServer ================================================ FILE: core/strategies/__init__.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. from .base import BaseStrategy from .fedavg import FedAvg from .dga import DGA from .fedlabels import FedLabels def select_strategy(strategy): ''' Selects the aggregation strategy class NOTE: FedProx uses FedAvg weights during aggregation, which are proportional to the number of samples in each client. ''' if strategy.lower() == 'dga': return DGA elif strategy.lower() in ['fedavg', 'fedprox']: return FedAvg elif strategy.lower() == 'fedlabels': return FedLabels else: raise ValueError(f'cannot use strategy f{strategy}') ================================================ FILE: core/strategies/base.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. from abc import abstractmethod @abstractmethod class BaseStrategy: def __init__(self, mode, config, model_path=None): '''Federated learning strategy Args: mode (str): which part the instantiated object should play, typically either :code:`client` or :code:`server`. config (dict): initial config dict. model_path (str): where to find model, needed for debugging only. ''' pass def generate_client_payload(self, trainer): '''Generate client payload Args: trainer (core.Trainer object): trainer on client. Returns: dict containing payloads in some specified format. ''' pass def process_individual_payload(self, worker_trainer, payload): '''Process client payload Args: worker_trainer (core.Trainer object): trainer on server (aka model updater). payload (dict): whatever is generated by :code:`generate_client_payload`. Returns: True if processed succesfully, False otherwise. 
class DGA(BaseStrategy):
    '''Dynamic Gradient Aggregation'''

    def __init__(self, mode, config, model_path=None):
        ''' Dynamic Gradient Aggregation (DGA) strategy.

        For more info see arXiv:2106.07578.

        Args:
            mode (str): which part the instantiated object should play,
                typically either :code:`client` or :code:`server`.
            config (dict): initial config dict.
            model_path (str): where to find model, needed for debugging only.
        '''
        super().__init__(mode=mode, config=config, model_path=model_path)

        if mode not in ['client', 'server']:
            raise ValueError('mode in strategy must be either `client` or `server`')

        self.config = config
        self.model_path = model_path
        self.mode = mode

        # Parse config
        self.model_config = config['model_config']
        self.client_config = config['client_config']
        self.server_config = config['server_config']
        self.dp_config = config.get('dp_config', None)

        if mode == 'client':
            # Client-side knobs: gradient stats source and quantization settings
            self.stats_on_smooth_grad = self.client_config.get('stats_on_smooth_grad', False)
            self.quant_threshold = self.client_config.get('quant_thresh', None)
            self.quant_bits = self.client_config.get('quant_bits', 10)
        elif mode == 'server':
            # Server-side knobs: aggregation mode, RL weighting and staleness simulation
            self.dump_norm_stats = self.config.get('dump_norm_stats', False)
            self.aggregate_fast = self.server_config.get('fast_aggregation', False)
            self.want_rl = self.server_config.get('wantRL', False)
            self.stale_prob = self.server_config.get('stale_prob', 0.0)
            self.skip_model_update = False

            # Do some checks and create objects based on configs:
            # fast aggregation sums gradients as they arrive, so it is
            # incompatible with RL re-weighting and with stale gradients
            if self.aggregate_fast:
                print_rank('It is NOT possible to enable RL with fast_aggregation, RL is set to False', loglevel=logging.INFO)
                self.want_rl = False
                print_rank('It is NOT possible in Current Implementation to have stale gradients with fast_aggregation, stale_prob is set to 0.0', loglevel=logging.INFO)
                self.stale_prob = 0.0

            if self.want_rl:
                self.rl = RL(config=self.server_config)

            # Initialize accumulators used across process/combine calls within a round
            self.client_parameters_stack = []
            self.client_parameters_stack_stale = []
            self.client_weights = []
            self.weight_sum_stale = 0.0
''' if self.mode != 'client': raise RuntimeError('this method can only be invoked by the client') # Get weights for aggregation, potentially using DGA weight = 1.0 add_weight_noise = False # Reset gradient stats and recalculate them on the smooth/pseudo gradient if self.stats_on_smooth_grad: trainer.reset_gradient_power() trainer.estimate_sufficient_stats() # If we are using softmax based on training loss, it needs DP noise if self.config['server_config']['aggregate_median'] == 'softmax': # This matters when DP is required add_weight_noise = True if 'weight_train_loss' not in self.config['server_config'] or self.config['server_config']['weight_train_loss'] == 'train_loss': training_weight = trainer.train_loss / trainer.num_samples elif self.config['server_config']['weight_train_loss'] == 'mag_var_loss': training_weight = trainer.sufficient_stats['var'] elif self.config['server_config']['weight_train_loss'] == 'mag_mean_loss': training_weight = trainer.sufficient_stats['mean'] else: training_weight = trainer.sufficient_stats['mag'] try: weight = math.exp(-self.config['server_config']['softmax_beta'] * training_weight) except: print_rank('There is an issue with the weight -- Reverting to {}'.format(MIN_WEIGHT), loglevel=logging.DEBUG) weight = MIN_WEIGHT weight = filter_weight(weight) # Add local DP noise here. # When weight == 0, something went wrong. So we'll skip adding noise and return a zero gradient. 
if weight > 0.0 and self.dp_config is not None and self.dp_config.get('enable_local_dp', False): weight = privacy.apply_local_dp(trainer, weight, self.dp_config, add_weight_noise) # In all other cases we can compute the weight after adding noise if not add_weight_noise: assert self.config['server_config']['aggregate_median'] == 'mean' assert weight == 1.0 # Weight the gradient and remove gradients of the layers we want to freeze for n, p in trainer.model.named_parameters(): p.grad = weight * p.grad if self.model_config.get('freeze_layer', None) and n == self.model_config['freeze_layer']: print_rank('Setting gradient to zero for layer: {}'.format(n), loglevel=logging.INFO) p.grad.mul_(0) # Gradient quantization step -- if quant_threshold is None, the code returns without doing anything quant_model(trainer.model, quant_threshold=self.quant_threshold, quant_bits=self.quant_bits, global_stats=False) payload = {} payload['weight'] = weight payload['gradients'] = [p.grad.to(torch.device('cpu')) for p in trainer.model.parameters()] return payload def process_individual_payload(self, worker_trainer, payload): '''Process client payload Args: worker_trainer (core.Trainer object): trainer on server (aka model updater). payload (dict): whatever is generated by :code:`generate_client_payload`. Returns: True if processed succesfully, False otherwise. ''' if self.mode != 'server': raise RuntimeError('this method can only be invoked by the server') if payload['weight'] == 0.0: return False self.client_weights.append(payload['weight']) if self.aggregate_fast: aggregate_gradients_inplace(worker_trainer.model, payload['gradients']) else: self.client_parameters_stack.append(payload['gradients']) return True def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None): '''Combine payloads to update model Args: worker_trainer (core.Trainer object): trainer on server (aka model updater). curr_iter (int): current iteration. 
    def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None):
        '''Combine payloads to update model

        Sums the stacked client gradients (optionally re-weighted via RL),
        normalizes by the weight sum, applies global DP, then steps the
        server optimizer and LR scheduler.

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            curr_iter (int): current iteration.
            num_clients_curr_iter (int): number of clients on current iteration.
            total_clients (int): size of total pool of clients (for privacy accounting)
            client_stats (dict): stats being collected.
            logger (callback): function called to log quantities.

        Returns:
            losses, computed for use with LR scheduler.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        if self.want_rl:
            rl_model = self._run_rl_inference(self.client_weights, *client_stats)

        # Aggregation step; keep detached copies first if cosine stats were requested,
        # since _aggregate_gradients clears the stack
        if self.dump_norm_stats:
            cps_copy = [[g.clone().detach() for g in x] for x in self.client_parameters_stack]
        weight_sum = self._aggregate_gradients(worker_trainer, num_clients_curr_iter, self.client_weights, metric_logger=logger)
        print_rank('Sum of weights: {}'.format(weight_sum), loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        # Normalize with weight_sum
        for p in worker_trainer.model.parameters():
            p.grad /= weight_sum

        if self.dump_norm_stats:
            cosines = compute_grad_cosines(cps_copy, [p.grad.clone().detach() for p in worker_trainer.model.parameters()])
            with open(os.path.join(self.model_path, 'cosines.txt'), 'a', encoding='utf-8') as outfile:
                outfile.write('{}\n'.format(json.dumps(cosines)))

        # DP-specific steps
        privacy.apply_global_dp(self.config, worker_trainer.model, num_clients_curr_iter=num_clients_curr_iter, select_grad=True, metric_logger=logger)
        eps = privacy.update_privacy_accountant(self.config, total_clients, curr_iter=curr_iter, num_clients_curr_iter=num_clients_curr_iter)
        if eps:
            print_rank(f'DP result: {eps}')

        # NOTE: returns None in this case, callers must tolerate a missing losses value
        if self.skip_model_update is True:
            print_rank('Skipping model update')
            return

        # Run optimization with gradient/model aggregated from clients
        print_rank('Updating model')
        worker_trainer.update_model()
        print_rank('Updating learning rate scheduler')
        losses = worker_trainer.run_lr_scheduler(force_run_val=False)

        if self.want_rl:
            self._run_rl_training(curr_iter, rl_model, self.client_weights, *client_stats, logger)

        return losses
worker_trainer, num_clients_curr_iter, client_weights, metric_logger=None): '''Go through stored gradients, aggregate and put them inside model. Args: num_clients_curr_iter (int): how many clients were processed. client_weights: weight for each client. metric_logger (callback, optional): callback used for logging. Defaults to None, in which case AML logger is used. Returns: float: sum of weights for all clients. ''' weight_sum = 0 if metric_logger is None: metric_logger = run.log if not self.aggregate_fast: metric_logger('Stale Gradients Ratio', len(self.client_parameters_stack_stale) / num_clients_curr_iter) if len(self.client_parameters_stack_stale) > 0: weight_sum = self.weight_sum_stale for client_parameters in self.client_parameters_stack_stale: # Model parameters are already multiplied with weight on client, we only have to sum them up aggregate_gradients_inplace(worker_trainer.model, client_parameters) self.client_parameters_stack_stale = [] self.weight_sum_stale = 0 for client_weight, client_parameters in zip(client_weights, self.client_parameters_stack): if np.random.random() > self.stale_prob: # Model parameters are already multiplied with weight on client, we only have to sum them up aggregate_gradients_inplace(worker_trainer.model, client_parameters) else: self.weight_sum_stale += client_weight self.client_parameters_stack_stale.append(client_parameters) weight_sum += sum(client_weights) - self.weight_sum_stale # Some cleaning self.client_parameters_stack = [] self.client_weights = [] return weight_sum def _run_rl_inference(self, client_weights, client_mag_grads, client_mean_grads, client_var_grads): '''Uses RL to estimate weights, using DGA. Args: client_weights (numpy.ndarray): original weights for aggregation. client_mag_grads (numpy.ndarray): gradient stats for RL (magnitudes). client_mean_grads (numpy.ndarray): gradient stats for RL (means). client_var_grads (numpy.ndarray): gradient stats for RL (vars). 
Returns: list of torch.Tensor: parameters of model used to perform RL. ''' weight_sum = 0 original_model = copy.copy([p for p in self.worker_trainer.model.parameters()]) # Reinforcement learning for estimating weights print_rank('RL estimation of the aggregation weights', loglevel=logging.INFO) rl_weights = self.rl.forward( np.concatenate((client_weights, client_mag_grads, client_mean_grads, client_var_grads), axis=0)).cpu().detach().np() if rl_weights.ndim > 1: rl_weights = rl_weights[-1, :] rl_weights = np.exp(rl_weights) print_rank('RL Weights BEFORE filtering: {}'.format(rl_weights), loglevel=logging.DEBUG) index = np.argwhere(np.isnan(rl_weights)) rl_weights[index] = 0 index = np.argwhere(np.isinf(rl_weights)) rl_weights[index] = 0 print_rank('RL Weights AFTER filtering: {}'.format(rl_weights), loglevel=logging.DEBUG) for client_parameters, orig_weight, rl_weight in zip(self.client_parameters_stack, client_weights, rl_weights): # Model parameters are already multiplied with weight on client, we only have to sum them up for p, client_grad in zip(self.worker_trainer.model.parameters(), client_parameters): if p.grad is None: p.grad = to_device(client_grad) * rl_weight / orig_weight else: p.grad += to_device(client_grad) * rl_weight / orig_weight weight_sum += rl_weight # Normalize with weight_sum for p in self.worker_trainer.model.parameters(): p.grad /= weight_sum # Run optimization with gradient/model aggregated from clients self.worker_trainer.update_model() # Get the validation result back (rl_val_loss, rl_val_acc) = self.worker_trainer.run_lr_scheduler(force_run_val=True) # Save model and revert to previous one rl_model = copy.copy([p.data for p in self.worker_trainer.model.parameters()]) for p, p_ in zip(self.worker_trainer.model.parameters(), original_model): p.data = p_.data.detach().clone() # Set the current set of weights self.rl.set_weights(rl_weights) self.rl.set_losses((rl_val_loss, rl_val_acc)) # Return the resulting RL-based model return rl_model 
def _run_rl_training(self, iter, rl_model, client_weights, client_mag_grads, client_mean_grads, client_var_grads, metric_logger): '''Trains RL for estimating weights, following DGA recipe. Args: iter (int): current iteration. rl_model (list of torch.Tensor): parameters of model used to perform RL. client_weights (numpy.ndarray): original weights for aggregation. client_mag_grads (numpy.ndarray): gradient stats for RL (magnitudes). client_mean_grads (numpy.ndarray): gradient stats for RL (means). client_var_grads (numpy.ndarray): gradient stats for RL (vars). metric_logger (callback, optional): callback used for logging. Defaults to None, in which case AML logger is used. ''' # Get the validation result back if None in self.losses: self.losses = self.run_distributed_inference(mode='val') # Expected structure of batch print_rank('Performing RL training on the aggregation weights') if abs(self.losses[1] - self.rl.rl_losses[1]) < 0.001: reward = 0.1 print_rank( 'Iter:{} val_ACC={} rl_val_ACC={} reward={}'.format(iter, self.losses[1], self.rl.rl_losses[1], reward)) if 'marginal_update_RL' in self.config['server_config'] and \ self.config['server_config']['marginal_update_RL']: self.losses = self.rl.rl_losses for p, p_ in zip(self.worker_trainer.model.parameters(), rl_model): p.data= p_.data.detach().clone() elif (self.losses[1] - self.rl.rl_losses[1]) > 0: reward = 1.0 print_rank( 'Iter:{} val_ACC={} rl_val_ACC={} reward={}'.format(iter, self.losses[1], self.rl.rl_losses[1], reward)) self.losses = self.rl.rl_losses for p, p_ in zip(self.worker_trainer.model.parameters(), rl_model): p.data = p_.data.detach().clone() else: reward = -1.0 print_rank( 'Iter:{} val_ACC={} rl_val_ACC={} reward={}'.format(iter, self.losses[1], self.rl.rl_losses[1], reward)) # Taking the policy from a game-based RL batch = ( (np.concatenate((client_weights, client_mag_grads, client_mean_grads, client_var_grads), axis=0)), (self.rl.rl_weights), [reward] ) print_rank('RL Model Update -- Training') 
self.rl.train(batch) print_rank('RL State Saving') self.rl.save(iter) print_rank('RL logging') metric_logger('RL Running Loss', self.rl.runningLoss) metric_logger('RL Rewards', reward) ================================================ FILE: core/strategies/fedavg.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import json import logging import os import torch from utils import compute_grad_cosines, print_rank from core.strategies import BaseStrategy from core.strategies.utils import ( aggregate_gradients_inplace, ) from azureml.core import Run run = Run.get_context() class FedAvg(BaseStrategy): '''Federated Averaging''' def __init__(self, mode, config, model_path=None): '''Federated Averaging strategy. Args: mode (str): which part the instantiated object should play, typically either :code:`client` or :code:`server`. config (dict): initial config dict. model_path (str): where to find model, needed for debugging only. ''' super().__init__(mode=mode, config=config, model_path=model_path) if mode not in ['client', 'server']: raise ValueError('mode in strategy must be either `client` or `server`') self.config = config self.model_path = model_path self.mode = mode # Parse config self.model_config = config['model_config'] self.client_config = config['client_config'] self.server_config = config['server_config'] self.dp_config = config.get('dp_config', None) if mode == 'client': self.stats_on_smooth_grad = self.client_config.get('stats_on_smooth_grad', False) elif mode == 'server': self.dump_norm_stats = self.config.get('dump_norm_stats', False) self.aggregate_fast = self.server_config.get('fast_aggregation', False) self.skip_model_update = False # Initialize accumulators self.client_parameters_stack = [] self.client_weights = [] def generate_client_payload(self, trainer): '''Generate client payload Args: trainer (core.Trainer object): trainer on client. 
        Returns:
            dict containing payloads in some specified format.
        '''
        if self.mode != 'client':
            raise RuntimeError('this method can only be invoked by the client')

        # Reset gradient stats and recalculate them on the smooth/pseudo gradient
        if self.stats_on_smooth_grad:
            trainer.reset_gradient_power()
            trainer.estimate_sufficient_stats()

        # Weight the gradient and remove gradients of the layers we want to freeze;
        # the weight is the client's sample count (classic FedAvg weighting)
        weight = trainer.num_samples
        for n, p in trainer.model.named_parameters():
            p.grad = weight * p.grad
            if self.model_config.get('freeze_layer', None) and n == self.model_config['freeze_layer']:
                print_rank('Setting gradient to zero for layer: {}'.format(n), loglevel=logging.INFO)
                p.grad.mul_(0)

        payload = {}
        payload['weight'] = weight
        # Move gradients to CPU so the payload can be serialized/shipped
        payload['gradients'] = [p.grad.to(torch.device('cpu')) for p in trainer.model.parameters()]

        return payload

    def process_individual_payload(self, worker_trainer, payload):
        '''Process client payload

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            payload (dict): whatever is generated by :code:`generate_client_payload`.

        Returns:
            True if processed successfully, False otherwise.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # A zero weight carries no information; drop the payload
        if payload['weight'] == 0.0:
            return False

        self.client_weights.append(payload['weight'])
        if self.aggregate_fast:
            # Sum gradients into the server model right away instead of stacking
            aggregate_gradients_inplace(worker_trainer.model, payload['gradients'])
        else:
            self.client_parameters_stack.append(payload['gradients'])

        return True

    def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None):
        '''Combine payloads to update model

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            curr_iter (int): current iteration.
            num_clients_curr_iter (int): number of clients on current iteration.
            client_stats (dict): stats being collected.
            logger (callback): function called to log quantities.

        Returns:
            losses, computed for use with LR scheduler.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # Aggregation step
        if self.dump_norm_stats:
            # Keep a detached copy of per-client gradients for cosine stats below
            cps_copy = [[g.clone().detach() for g in x] for x in self.client_parameters_stack]
        weight_sum = self._aggregate_gradients(worker_trainer, num_clients_curr_iter, self.client_weights, metric_logger=logger)
        print_rank('Sum of weights: {}'.format(weight_sum), loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        # Normalize with weight_sum (turns the weighted sum into a weighted average)
        for p in worker_trainer.model.parameters():
            p.grad /= weight_sum

        if self.dump_norm_stats:
            cosines = compute_grad_cosines(cps_copy, [p.grad.clone().detach() for p in worker_trainer.model.parameters()])
            with open(os.path.join(self.model_path, 'cosines.txt'), 'a', encoding='utf-8') as outfile:
                outfile.write('{}\n'.format(json.dumps(cosines)))

        if self.skip_model_update is True:
            print_rank('Skipping model update')
            return

        # Run optimization with gradient/model aggregated from clients
        print_rank('Updating model')
        worker_trainer.update_model()
        print_rank('Updating learning rate scheduler')
        losses = worker_trainer.run_lr_scheduler(force_run_val=False)

        # TODO: Global DP. See dga.py

        return losses

    def _aggregate_gradients(self, worker_trainer, num_clients_curr_iter, client_weights, metric_logger=None):
        '''Go through stored gradients, aggregate and put them inside model.

        Args:
            num_clients_curr_iter (int): how many clients were processed.
            client_weights: weight for each client.
            metric_logger (callback, optional): callback used for logging.
                Defaults to None, in which case AML logger is used.

        Returns:
            float: sum of weights for all clients.
        '''
        if metric_logger is None:
            metric_logger = run.log

        if not self.aggregate_fast:
            # In fast mode the sums already happened in process_individual_payload
            for client_parameters in self.client_parameters_stack:
                # Model parameters are already multiplied with weight on client, we only have to sum them up
                aggregate_gradients_inplace(worker_trainer.model, client_parameters)
        weight_sum = sum(client_weights)

        # Some cleaning
        self.client_parameters_stack = []
        self.client_weights = []

        return weight_sum


================================================
FILE: core/strategies/fedlabels.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import json
import logging
import os

import torch
import numpy as np

from azureml.core import Run
from core.strategies import BaseStrategy
from utils import (
    compute_grad_cosines,
    print_rank,
    to_device)

run = Run.get_context()


class FedLabels(BaseStrategy):
    '''FedLabels: Semi-supervision strategy.'''

    def __init__(self, mode, config, model_path=None):
        '''
        Args:
            mode (str): which part the instantiated object should play,
                typically either :code:`client` or :code:`server`.
            config (dict): initial config dict.
            model_path (str): where to find model, needed for debugging only.
        '''
        super().__init__(mode=mode, config=config, model_path=model_path)

        if mode not in ['client', 'server']:
            raise ValueError('mode in strategy must be either `client` or `server`')

        self.config = config
        self.model_path = model_path
        self.mode = mode

        # Parse config
        self.model_config = config['model_config']
        self.client_config = config['client_config']
        self.server_config = config['server_config']
        self.dp_config = config.get('dp_config', None)

        # Aggregated state dicts of the supervised / unsupervised models,
        # filled in by combine_payloads on the server
        self.tmp_sup = None
        self.tmp_unsup = None

        if mode == 'client':
            self.stats_on_smooth_grad = self.client_config.get('stats_on_smooth_grad', False)
        elif mode == 'server':
            self.dump_norm_stats = self.config.get('dump_norm_stats', False)
            self.aggregate_fast = self.server_config.get('fast_aggregation', False)
            self.skip_model_update = False

            # Initialize accumulators
            self.client_parameters_stack = []
            self.client_weights = []

    def generate_client_payload(self, trainer):
        '''Generate client payload

        Args:
            trainer (core.Trainer object): trainer on client.
            unsup_dict (dict): unsupervised model state dictionary
            iteration (int): training round
            total_est_labels (int): labels generated

        Returns:
            dict containing payloads in some specified format.
        '''
        # The unsupervised model state dict is produced during client training
        unsup_dict = trainer.algo_computation

        if self.mode != 'client':
            raise RuntimeError('this method can only be invoked by the client')

        # Reset gradient stats and recalculate them on the smooth/pseudo gradient
        if self.stats_on_smooth_grad:
            trainer.reset_gradient_power()
            trainer.estimate_sufficient_stats()

        # Weight the gradient and preprocess state dictionaries from supervised and unsupervised model
        weight = 1 if trainer.num_samples == 0 else trainer.num_samples
        unsup_grads = [unsup_dict[param_tensor].to(torch.device('cpu')) for param_tensor in unsup_dict.keys()]
        sup_grads = [trainer.model.state_dict()[param_tensor].to(torch.device('cpu')) for param_tensor in trainer.model.state_dict().keys()]

        payload = {}
        payload['weight'] = weight
        # Supervised tensors first, unsupervised second; the server splits this
        # list in half again in _aggregate_gradients
        payload['gradients'] = sup_grads + unsup_grads

        return payload

    def process_individual_payload(self, worker_trainer, payload):
        '''Process client payload

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            payload (dict): whatever is generated by :code:`generate_client_payload`.

        Returns:
            True if processed successfully, False otherwise.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # A zero weight carries no information; drop the payload
        if payload['weight'] == 0.0:
            return False

        self.client_weights.append(payload['weight'])
        if self.aggregate_fast:
            # NOTE(review): the only aggregate_gradients_inplace in scope here
            # is the module-level function below, which takes
            # (keys, values, first, tmp, ratio) -- calling it with two
            # arguments would raise TypeError, so the fast_aggregation path
            # appears broken for FedLabels; confirm before enabling it.
            aggregate_gradients_inplace(worker_trainer.model, payload['gradients'])
        else:
            self.client_parameters_stack.append(payload['gradients'])

        return True

    def combine_payloads(self, worker_trainer, curr_iter, num_clients_curr_iter, total_clients, client_stats, logger=None):
        '''Combine payloads to update model

        Args:
            worker_trainer (core.Trainer object): trainer on server (aka model updater).
            curr_iter (int): current iteration.
            num_clients_curr_iter (int): number of clients on current iteration.
            client_stats (dict): stats being collected.
            logger (callback): function called to log quantities.

        Returns:
            losses, computed for use with LR scheduler.
        '''
        if self.mode != 'server':
            raise RuntimeError('this method can only be invoked by the server')

        # Aggregation step
        if self.dump_norm_stats:
            # Keep a detached copy of per-client payloads for cosine stats below
            cps_copy = [[g.clone().detach() for g in x] for x in self.client_parameters_stack]
        weight_sum, self.tmp_sup, self.tmp_unsup = self._aggregate_gradients(worker_trainer, num_clients_curr_iter, self.client_weights, metric_logger=logger)
        print_rank('Sum of weights: {}'.format(weight_sum), loglevel=logging.DEBUG)
        torch.cuda.empty_cache()

        # Disjoint aggregation: equal-weight blend of the supervised and
        # unsupervised aggregated state dicts, loaded directly into the model
        tmp_both = {}
        for param_key in self.tmp_unsup.keys():
            tmp_both[param_key] = self.tmp_sup[param_key]/2 + self.tmp_unsup[param_key]/2
        worker_trainer.model.load_state_dict(tmp_both)

        if self.dump_norm_stats:
            cosines = compute_grad_cosines(cps_copy, [p.grad.clone().detach() for p in worker_trainer.model.parameters()])
            with open(os.path.join(self.model_path, 'cosines.txt'), 'a', encoding='utf-8') as outfile:
                outfile.write('{}\n'.format(json.dumps(cosines)))

        if self.skip_model_update is True:
            print_rank('Skipping model update')
            return

        # Run optimization with gradient/model aggregated from clients
        print_rank('Updating model')
        worker_trainer.update_model()
        print_rank('Updating learning rate scheduler')
        losses = worker_trainer.run_lr_scheduler(force_run_val=False)

        # TODO: Global DP. See dga.py

        return losses

    def _aggregate_gradients(self, worker_trainer, num_clients_curr_iter, client_weights, metric_logger=None):
        '''Go through stored gradients, aggregate and put them inside model.

        Args:
            num_clients_curr_iter (int): how many clients were processed.
            client_weights: weight for each client.
            metric_logger (callback, optional): callback used for logging.
                Defaults to None, in which case AML logger is used.

        Returns:
            float: sum of weights for all clients.
            dict: supervised model state dictionary.
            dict: unsupervised model state dictionary.
        '''
        if metric_logger is None:
            metric_logger = run.log

        # Separate sup/unsup dictionaries from client payload: the first half
        # of each payload is the supervised model, the second the unsupervised
        sup_slice = int(len(self.client_parameters_stack[0])/2)
        keys = [key for key in worker_trainer.model.state_dict()]
        model_dicts = [client_dict[:sup_slice] for client_dict in self.client_parameters_stack]
        unsup_dicts = [client_dict[sup_slice:] for client_dict in self.client_parameters_stack]
        first = True
        tmp_sup, tmp_unsup = {}, {}

        # Compute ratios for each model: uniform for supervised, sample-count
        # weighted for unsupervised
        weight_sum = sum(client_weights)
        ratio_sup = 1/len(client_weights)
        ratio_unsup = np.array(client_weights)/weight_sum

        if not self.aggregate_fast:
            # Perform aggregation for supervised model
            for i, client_parameters in enumerate(model_dicts):
                first, tmp_sup = aggregate_gradients_inplace(keys, client_parameters, first, tmp_sup, ratio_sup)
            first = True
            # Perform aggregation for unsupervised model
            for j, client_parameters in enumerate(unsup_dicts):
                first, tmp_unsup = aggregate_gradients_inplace(keys, client_parameters, first, tmp_unsup, ratio_unsup[j])

        # Some cleaning
        self.client_parameters_stack = []
        self.client_weights = []

        return weight_sum, tmp_sup, tmp_unsup


def aggregate_gradients_inplace(keys, values, first, tmp, ratio):
    '''Aggregate list of tensors into model dictionary.

    Args:
        keys (list): state dictionary keys of model to which dictionaries will be summed.
        values (list): list of values to sum to model dictionary.
        first (bool): flag that indicates the first value in the dictionary.
        tmp (dict): model state dictionary that will be summed.
        ratio (float): ratio to weight each client value.
    '''
    for param_key, client_dict in zip(keys, values):
        if first:
            tmp[param_key] = to_device(client_dict) * ratio
        else:
            tmp[param_key] += to_device(client_dict) * ratio
    return False, tmp


================================================
FILE: core/strategies/utils.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging import numpy as np from utils import print_rank, to_device def filter_weight(weight): '''Handles aggregation weights if something messed them up''' print_rank('Client Weight BEFORE filtering: {}'.format(weight), loglevel=logging.DEBUG) if np.isnan(weight) or not np.isfinite(weight): weight = 0.0 elif weight > 100: weight = 100 print_rank('Client Weights AFTER filtering: {}'.format(weight), loglevel=logging.DEBUG) return weight def aggregate_gradients_inplace(model, gradients): '''Aggregate list of tensors into model gradients. Args: model (torch.nn.Module): model to which gradients will be summed. gradients (list): list of gradients to sum to model. ''' for p, client_grad in zip(model.parameters(), gradients): if p.grad is None: p.grad = to_device(client_grad) else: p.grad += to_device(client_grad) ================================================ FILE: core/trainer.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging import os import re import copy import random import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.data import DataLoader from core.metrics import Metrics from utils import \ get_lr, \ get_lr_all, \ make_optimizer, \ make_lr_scheduler, \ print_rank, \ torch_save, \ try_except_save, \ write_yaml from utils.utils import ( to_device, get_label_VAT) class TrainerBase: """Abstract class defining Trainer objects' common interface. Args: model (torch.nn.Module): model to be trained. train_dataloader (torch.utils.data.DataLoader): dataloader that provides the training data. optimizer: (torch.optim.Optimizer): optimizer that will be used to update the model. max_grad_norm (float): if not None, avg gradients are clipped to this norm; defaults to None. ignore_subtask (bool): ignore subtasks, defaults to True. model_type (str): what kind of model is used, defaults to :code:`LanguageModel`. 
        decoder_config (dict or None): config for decoder, defaults to None.
    """
    def __init__(
            self,
            model,
            train_dataloader,
            optimizer,
            max_grad_norm=None,
            ignore_subtask=True,
            model_type="LanguageModel",
            decoder_config=None
    ):
        self.model = model
        self.train_dataloader = train_dataloader
        self.optimizer = optimizer
        self.max_grad_norm = max_grad_norm
        self.model_type = model_type
        self.decoder_config = decoder_config

        self.step = 0  # count how many batches are processed
        self.ignore_subtask = ignore_subtask  # ignore subtasks even if there are multiple task branches

    def epoch_boundary(self):
        '''Check if we are at the end of any given epoch.'''
        return self.step % len(self.train_dataloader.create_loader()) == 0 and self.step != 0

    def train_desired_samples(self, desired_max_samples, apply_privacy_metrics):
        # Abstract hook: subclasses implement the actual training step
        pass

    def save(self):
        # Abstract hook: subclasses implement checkpoint saving
        pass

    def load(self):
        # Abstract hook: subclasses implement checkpoint restore
        pass


class ModelUpdater(TrainerBase):
    """Update the model, given the already computed gradient.

    This is a special kind of trainer, that actually does not use any data.

    Args:
        model (torch.nn.Module): model to be updated.
        optimizer (torch.optim.Optimizer): optimizer that will be used to update the model.
        ss_scheduler: scheduled sampler.
        train_dataloader: train dataloader, this is not actually used.
        val_dataloader: val dataloader, this is not actually used.
        max_grad_norm (float): avg gradients are clipped to this norm.
        anneal_config (dict): annealing configuration.
        model_type (str): what kind of model is used, defaults to :code:`LanguageModel`.
        decoder_config (dict): config for decoder, defaults to None.
    """
    def __init__(
            self,
            model,
            optimizer,
            ss_scheduler,
            train_dataloader,
            val_dataloader,
            max_grad_norm,
            anneal_config,
            model_type="LanguageModel",
            decoder_config=None
    ):
        super().__init__(
            model=model,
            train_dataloader=train_dataloader,
            optimizer=optimizer,
            max_grad_norm=max_grad_norm,
            model_type=model_type,
            decoder_config=decoder_config
        )

        self.val_dataloader = val_dataloader
        self.annealing_type = anneal_config["type"] if anneal_config is not None else None
        self.lr_scheduler = make_lr_scheduler(anneal_config, self.optimizer)
        self.ss_scheduler = ss_scheduler

    def update_model(self):
        """Update model parameters using pre-computed gradients."""

        # Apply gradient clipping
        if self.max_grad_norm is not None:
            grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            print_rank(f"clipped norm: {grad_norm} to {min(grad_norm,self.max_grad_norm)}", logging.DEBUG)

        # Do optimizer step
        self.optimizer.step()
        self.optimizer.zero_grad()

    def run_lr_scheduler(self, force_run_val=False):
        """Update learning rate using scheduler."""

        val_loss = val_acc = None
        # Validation is needed either when explicitly forced or when the
        # scheduler anneals on validation loss
        if force_run_val is True or self.annealing_type == "val_loss":
            _, val_loss, val_acc = run_validation_generic(self.model, self.val_dataloader)

        # Do LR scheduling
        print_rank(f"LR all: {list(get_lr_all(self.optimizer))}", loglevel=logging.DEBUG)
        print_rank("LR BEFORE lr_scheduler step: {}".format(get_lr(self.optimizer)))
        if self.annealing_type == "val_loss":
            self.lr_scheduler.step(val_loss)
        else:
            self.lr_scheduler.step()
        print_rank("LR AFTER lr_scheduler step: {}".format(get_lr(self.optimizer)), loglevel=logging.DEBUG)

        return (val_loss, val_acc)

    def run_ss_scheduler(self):
        """Do scheduled sampling."""

        if self.ss_scheduler is not None:
            self.ss_scheduler.step()

    def save(self, model_path, token=None, config=None):
        """Save model to disk."""

        save_model(
            model_path=model_path,
            config=config,
            model=self.model,
            optimizer=self.optimizer,
            lr_scheduler=self.lr_scheduler,
            ss_scheduler=self.ss_scheduler,
            token=token
        )

    def load(self, save_path, update_lr_scheduler, update_ss_scheduler):
        """Load model from disk.

        If save_path is given, load from there. If not, then resume training
        from current model dir. If at any point the save_path is not present on
        the disk, it won't be loaded.
        """
        if os.path.isfile(save_path):
            print_rank("Loading checkpoint: {}".format(save_path))
            checkpoint = torch.load(save_path)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            if self.optimizer is not None:
                self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

            anl_st_dict = checkpoint.get("lr_scheduler_state_dict")
            if anl_st_dict and self.lr_scheduler is not None and update_lr_scheduler is True:
                self.lr_scheduler.load_state_dict(anl_st_dict)

            sss_st_dict = checkpoint.get("ss_scheduler_state_dict")
            # NOTE(review): this condition checks update_lr_scheduler rather
            # than update_ss_scheduler, and the update_ss_scheduler parameter
            # is otherwise unused -- likely a bug; confirm intended behavior.
            if sss_st_dict and self.ss_scheduler is not None and update_lr_scheduler is True:
                self.ss_scheduler.load_state_dict(sss_st_dict)


class Trainer(TrainerBase):
    """Perform training step for any given client.

    The main method to be called for triggering a training step is
    :code:`train_desired_samples`, which on its turn relies on
    :code:`run_train_epoch`.

    Args:
        model (torch.nn.Module): model to be trained.
        ss_scheduler: scheduled sampler.
        train_dataloader (torch.data.utils.DataLoader): dataloader that
            provides the training data.
        server_replay_config (dict or None): config for replaying training;
            defaults to None, in which case no replaying happens.
        optimizer (torch.optim.Optimizer or None): optimizer that will be used
            to update the model. If :code:`None`, skip optimization.
        max_grad_norm (float or None): if not None, avg gradients are clipped
            to this norm; defaults to None.
        anneal_config (dict or None): annealing configuration.
        num_skips_threshold (int): previously used to skip users, deprecated.
        ignore_subtask (bool): ignore subtasks, defaults to True.
""" def __init__( self, model, ss_scheduler, train_dataloader, server_replay_config=None, optimizer=None, max_grad_norm=None, anneal_config=None, num_skips_threshold=-1, ignore_subtask=True ): super().__init__( model=model, train_dataloader=train_dataloader, optimizer=optimizer, max_grad_norm=max_grad_norm, ignore_subtask=ignore_subtask ) self.server_replay_config=None if server_replay_config is not None: self.server_replay_config = server_replay_config self.anneal_config=None if anneal_config is not None: self.anneal_config = anneal_config self.lr_scheduler = None if self.optimizer is None and self.server_replay_config is not None and "optimizer" in self.server_replay_config: self.optimizer = make_optimizer(self.server_replay_config["optimizer_config"], model) if self.optimizer is not None and self.anneal_config is not None: self.lr_scheduler = make_lr_scheduler( self.anneal_config, self.optimizer) self.cached_batches = [] self.ss_scheduler = ss_scheduler def reset_gradient_power(self): """Reset the sum of gradient power. This is used to compute statistics about the gradients. """ self.sum_grad = self.sum_grad2 = self.counter = 0 def accumulate_gradient_power(self): """Compute sum of gradient power. This is used to compute statistics about the gradients. """ for p in self.model.parameters(): if p.grad is None: continue grad = p.grad.detach().clone().cpu().numpy() p1 = np.sum(grad) p2 = np.sum(grad ** 2) n = p.grad.numel() self.sum_grad += p1 self.sum_grad2 += p2 self.counter += n print_rank("Magn. Grad. Squared: {}".format(self.sum_grad2), loglevel=logging.DEBUG) print_rank("Magn. 
Grad.: {}".format(self.sum_grad), loglevel=logging.DEBUG) return self.sum_grad, self.sum_grad2, self.counter def estimate_sufficient_stats(self): """Compute statistics about the gradients.""" sum_mean_grad, sum_mean_grad2, n = self.accumulate_gradient_power() mean_grad = sum_mean_grad / n mag_grad = np.sqrt(sum_mean_grad2 / n) var_grad = sum_mean_grad2 / n - mag_grad**2 norm_grad = np.sqrt(sum_mean_grad2) self.sufficient_stats = { "n": n, "sum": sum_mean_grad, "sq_sum": sum_mean_grad2, "var": var_grad, "mean": mean_grad, "mag": mag_grad, "norm": norm_grad } def train_desired_samples(self, desired_max_samples=None, apply_privacy_metrics=False, algo_payload = None): """Triggers training step. Args: desired_max_samples (int): number of samples that you would like to process. apply_privacy_metrics (bool): whether to save the batches used for the round for privacy metrics evaluation. Returns: 2-tuple of (float, int): total training loss and number of processed samples. """ num_samples = 0 total_train_loss = 0 algo_computation = None if algo_payload == None: num_samples_per_epoch, train_loss_per_epoch = self.run_train_epoch(desired_max_samples, apply_privacy_metrics) elif algo_payload['strategy'] == 'FedLabels': num_samples_per_epoch, train_loss_per_epoch, algo_computation = self.run_train_epoch_sup(desired_max_samples, apply_privacy_metrics, algo_payload) elif algo_payload['strategy'] == 'FedProx': num_samples_per_epoch, train_loss_per_epoch = self.run_train_epoch_fedprox(desired_max_samples, apply_privacy_metrics, algo_payload) num_samples += num_samples_per_epoch total_train_loss += train_loss_per_epoch return total_train_loss, num_samples, algo_computation def run_train_epoch(self, desired_max_samples=None, apply_privacy_metrics=False): """Implementation example for training the model. The training process should stop after the desired number of samples is processed. Args: desired_max_samples (int): number of samples that you would like to process. 
            apply_privacy_metrics (bool): whether to save the batches used for
                the round for privacy metrics evaluation.

        Returns:
            2-tuple of (int, float): number of processed samples and total training loss.
        """
        sum_train_loss = 0.0
        num_samples = 0
        self.reset_gradient_power()

        # Reset gradient just in case
        self.model.zero_grad()

        train_loader = self.train_dataloader.create_loader()
        for batch in train_loader:
            if desired_max_samples is not None and num_samples >= desired_max_samples:
                break

            # Compute loss
            if self.optimizer is not None:
                self.optimizer.zero_grad()

            if self.ignore_subtask is True:
                loss = self.model.single_task_loss(batch)
            else:
                if apply_privacy_metrics:
                    # NOTE(review): `indices` is only bound when the batch has
                    # an "x" or "input_ids" key; other batch layouts would
                    # raise NameError here -- confirm the expected schema.
                    if "x" in batch:
                        indices = to_device(batch["x"])
                    elif "input_ids" in batch:
                        indices = to_device(batch["input_ids"])
                    self.cached_batches.append(indices)
                loss = self.model.loss(batch)
            loss.backward()

            # Apply gradient clipping
            if self.max_grad_norm is not None:
                grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            # Sum up the gradient power
            self.estimate_sufficient_stats()

            # Now that the gradients have been scaled, we can apply them
            if self.optimizer is not None:
                self.optimizer.step()
            print_rank("step: {}, loss: {}".format(self.step, loss.item()), loglevel=logging.DEBUG)

            # Post-processing in this loop
            # Sum up the loss
            sum_train_loss += loss.item()

            # Increment the number of frames processed already
            if "attention_mask" in batch:
                num_samples += torch.sum(batch["attention_mask"].detach().cpu() == 1).item()
            elif "total_frames" in batch:
                num_samples += batch["total_frames"]
            else:
                num_samples += len(batch["x"])

            # Update the counters
            self.step += 1

        # Take a step in lr_scheduler
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return num_samples, sum_train_loss

    def run_train_epoch_fedprox(self, desired_max_samples=None, apply_privacy_metrics=False, algo_payload=None):
        """Implementation example for training the model.

        The training process should stop after the desired number of samples is processed.

        Args:
            desired_max_samples (int): number of samples that you would like to process.
            apply_privacy_metrics (bool): whether to save the batches used for
                the round for privacy metrics evaluation.
            algo_payload (dict): hyperparameters needed to fine-tune FedProx algorithm.

        Returns:
            2-tuple of (int, float): number of processed samples and total training loss.
        """
        sum_train_loss = 0.0
        num_samples = 0
        self.reset_gradient_power()

        # Reset gradient just in case
        self.model.zero_grad()

        # FedProx parameters: mu scales the proximal term; keep a frozen copy
        # of the global (round-start) model to regularize against
        mu = algo_payload['mu']
        global_model = to_device(copy.deepcopy(self.model))
        global_weight_collector = list(global_model.parameters())

        train_loader = self.train_dataloader.create_loader()
        for batch in train_loader:
            if desired_max_samples is not None and num_samples >= desired_max_samples:
                break

            # Compute loss
            if self.optimizer is not None:
                self.optimizer.zero_grad()

            if self.ignore_subtask is True:
                loss = self.model.single_task_loss(batch)
            else:
                if apply_privacy_metrics:
                    # NOTE(review): as in run_train_epoch, `indices` is only
                    # bound for "x"/"input_ids" batches -- confirm schema.
                    if "x" in batch:
                        indices = to_device(batch["x"])
                    elif "input_ids" in batch:
                        indices = to_device(batch["input_ids"])
                    self.cached_batches.append(indices)
                loss = self.model.loss(batch)

            # FedProx regularization term: (mu/2) * ||w - w_global||^2
            fed_prox_reg = 0.0
            for param_index, param in enumerate(self.model.parameters()):
                fed_prox_reg += ((mu / 2) * torch.norm((param - global_weight_collector[param_index]))**2)
            loss += fed_prox_reg

            loss.backward()

            # Apply gradient clipping
            if self.max_grad_norm is not None:
                grad_norm = nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)

            # Sum up the gradient power
            self.estimate_sufficient_stats()

            # Now that the gradients have been scaled, we can apply them
            if self.optimizer is not None:
                self.optimizer.step()
            print_rank("step: {}, loss: {}".format(self.step, loss.item()), loglevel=logging.DEBUG)

            # Post-processing in this loop
            # Sum up the loss
            sum_train_loss += loss.item()

            # Increment the number of frames processed already
            if "attention_mask" in batch:
                num_samples += torch.sum(batch["attention_mask"].detach().cpu() == 1).item()
            elif "total_frames" in batch:
                num_samples += batch["total_frames"]
            else:
                num_samples += len(batch["x"])

            # Update the counters
            self.step += 1

        # Take a step in lr_scheduler
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()

        return num_samples, sum_train_loss

    def run_train_epoch_sup(self, desired_max_samples=None, apply_privacy_metrics=False, algo_payload=None):
        """Implementation example for training the model using semisupervision.

        Args:
            desired_max_samples (int): number of samples that you would like to process.
            apply_privacy_metrics (bool): whether to save the batches used for
                the round for privacy metrics evaluation.
            algo_payload (dict): datasets and configuration used during training
                for the FedLabels algorithm.

        Returns:
            3-tuple of (int, float, dict): number of processed samples, total
                training loss and unsupervised model state dict.
        """
        sum_train_loss = 0.0
        num_samples = 0
        round_ = algo_payload['iter']
        semisupervision_config = algo_payload['config']
        self.reset_gradient_power()

        # Reset gradient just in case
        self.model.zero_grad()

        # Losses / helpers for the semi-supervised objective
        KL_pointLoss = torch.nn.KLDivLoss(reduction="none", log_target=True)
        MSELoss = torch.nn.MSELoss()
        Softmax = torch.nn.LogSoftmax(dim=1)
        nolog_Softmax = torch.nn.Softmax(dim=1)
        initial_net = copy.deepcopy(self.model)
        loss_func = torch.nn.CrossEntropyLoss()

        # Create datasets
        normal_dataset, unsupdataset, unsupdataset_rand = algo_payload['data'][0], algo_payload['data'][1], algo_payload['data'][2]

        # NOTE(review): this overwrites self.optimizer with a hard-coded
        # SGD(lr=0.003, momentum=0) regardless of config -- confirm this is
        # intentional for FedLabels.
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.003, momentum=0)

        for i in range(int(semisupervision_config['train_ep'])):
            # One freshly-shuffled supervised batch per iteration
            sup_train = DataLoader(normal_dataset, batch_size=64, shuffle=True)
            data_sup = iter(sup_train)
            (images, labels) = next(data_sup)
            self.model.zero_grad()
            labels = to_device(labels)
            log_probs = self.model(to_device(images))
            loss = loss_func(log_probs, labels)
            num_samples += len(labels)
sum_train_loss += loss.item() loss.backward() self.optimizer.step() self.estimate_sufficient_stats() self.step += 1 # Update the counters print_rank("step: {}, loss: {}".format(self.step, loss.item()), loglevel=logging.DEBUG) net = copy.deepcopy(initial_net) optimizer = torch.optim.SGD(net.parameters(), lr=semisupervision_config['eta'], momentum=0) total_est_labels = 0 total_est_ratios = 0 correct = 0 if round_ >= semisupervision_config['burnout_round']: for _ in range(int(semisupervision_config['unsuptrain_ep'])): data_idx = random.sample(range(len(unsupdataset)), semisupervision_config['unl_bs']) partitioned = torch.utils.data.Subset(unsupdataset, indices=data_idx) ldr_train = DataLoader(partitioned, batch_size=semisupervision_config['bs'], shuffle=False) (images, true_labels) = next(iter(ldr_train)) images, true_labels = to_device(images), to_device(true_labels) initial_net.eval() self.model.eval() with torch.no_grad(): output_local = initial_net(images).detach() output_server = self.model(images).detach() local_logits = nolog_Softmax(output_local/semisupervision_config['temp']) server_logits = nolog_Softmax(output_server / semisupervision_config['temp']) est_labels, est_idx, est_var, est_ratio = get_label_VAT(local_logits, server_logits, semisupervision_config['thre'], semisupervision_config['comp']) total_est_labels += len(est_labels) total_est_ratios += est_ratio/semisupervision_config['unsuptrain_ep'] if len(est_labels) != 0: partitioned_rand = torch.utils.data.Subset(unsupdataset_rand, indices=data_idx) ldr_rand_train = DataLoader(partitioned_rand, batch_size=semisupervision_config['bs'], shuffle=False) (rand_images, _) = next(iter(ldr_rand_train)) rand_images = to_device(rand_images) correct += ((est_labels == true_labels[est_idx]).sum().item()) / ( len(est_idx) * semisupervision_config['unsuptrain_ep']) lamb_consist = semisupervision_config['vat_consis'] net.train() output = net(rand_images[est_idx]) if semisupervision_config['uda'] == 1 else 
net(images[est_idx]) output_norand = net(images[est_idx]) # Compute Losses, this should go inside model.py unsup_loss = loss_func(output, est_labels) kl_point_loss = KL_pointLoss(Softmax(output_norand / semisupervision_config['temp']), Softmax(output_server[est_idx]/semisupervision_config['temp'])) consist_loss = torch.tensor(0.0, requires_grad=True) consist_tmp = torch.tensor(0.0) for i in range(len(est_var)): if torch.argmax(local_logits[est_idx[i]]) == torch.argmax(server_logits[est_idx[i]]): dummy = kl_point_loss[i]*est_var[i] consist_tmp += 1 consist_loss = consist_loss+ dummy.sum() if consist_tmp != torch.tensor(0.0): consist_loss = consist_loss/consist_tmp l2_lambda = semisupervision_config['l2_lambda'] initial_net.eval() reg_loss = torch.tensor(0., requires_grad=True) for p, prev_param in zip(net.parameters(), initial_net.parameters()): reg_loss = reg_loss + MSELoss(p, prev_param) (semisupervision_config['unsup_lamb']*unsup_loss + lamb_consist*consist_loss+l2_lambda*reg_loss).backward(retain_graph=True) optimizer.step() return total_est_labels, sum_train_loss/semisupervision_config['ensize'], net.state_dict() def get_model(self): return copy.deepcopy(self.model) def prepare_iteration(self, model=None): """Steps to run before iteration begins.""" if model is not None: self.model.load_state_dict(model.state_dict()) self.lr_scheduler = None if self.optimizer is None and self.server_replay_config is not None and \ "optimizer_config" in self.server_replay_config: print_rank("Creating server-side replay training optimizer", loglevel=logging.DEBUG) self.optimizer = make_optimizer(self.server_replay_config["optimizer_config"], self.model) if self.optimizer is not None and self.anneal_config is not None: print_rank("Creating server-side replay-training lr_scheduler", loglevel=logging.DEBUG) self.lr_scheduler = make_lr_scheduler(self.anneal_config, self.optimizer) def reset_optimizer(self, optimizer_state_dict, annealing_config=None): """Re-load optimizer.""" assert 
self.optimizer is not None, "This trainer does not have an optimizer" # Load optimizer on state dict self.optimizer.load_state_dict(optimizer_state_dict) # Set learning rate scheduler self.lr_scheduler = None if annealing_config is not None: self.lr_scheduler = make_lr_scheduler(annealing_config, self.optimizer) def save(self, model_path, token=None, config=None): """Save model to disk.""" save_model( model_path=model_path, config=config, model=self.model, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler, ss_scheduler=self.ss_scheduler, token=token ) def load(self, save_path, update_lr_scheduler, update_ss_scheduler): """Load model from disk. If save_path is given, load from there. If not, then resume training from current model dir. If at any point the save_path is not present on the disk, it won't be loaded. """ if os.path.isfile(save_path): print_rank("Loading checkpoint: {}".format(save_path)) checkpoint = torch.load(save_path) self.model.load_state_dict(checkpoint["model_state_dict"]) if self.optimizer is not None: self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) anl_st_dict = checkpoint.get("lr_scheduler_state_dict") if anl_st_dict and self.lr_scheduler is not None and update_lr_scheduler is True: self.lr_scheduler.load_state_dict(anl_st_dict) sss_st_dict = checkpoint.get("ss_scheduler_state_dict") if sss_st_dict and self.ss_scheduler is not None and update_lr_scheduler is True: self.ss_scheduler.load_state_dict(sss_st_dict) def run_validation_generic(model, val_dataloader): """Perform a validation step. Args: model (torch.nn.Module): model to be validated. val_dataloader (torch.data.utils.DataLoader): provides val data. Returns: Average validation loss. """ print_rank("run_validation_generic", loglevel=logging.DEBUG) model.set_eval() print_rank("set_eval", loglevel=logging.DEBUG) # Initialize dataloader etc. 
val_loader = val_dataloader.create_loader() print_rank( f"created loader {val_loader.num_workers}, " + \ f"users: {len(val_dataloader.dataset.user_list)} " + \ f"examples: {sum(val_dataloader.dataset.num_samples)} " + \ f"lendata: {len(val_loader)} ", loglevel=logging.DEBUG ) print_rank( f"drop_last: {val_loader.drop_last} " + \ f"len_sampler: {len(val_loader._index_sampler)}", loglevel=logging.DEBUG ) print_rank("Loading metrics ...", logging.DEBUG) metrics_cl = Metrics() return metrics_cl.compute_metrics(dataloader=val_loader, model=model) def set_component_wise_lr(model, optimizer_config, updatable_names): """Set zero learning rate for layers in order to freeze the update. Args: model (torch.nn.Module): optimizer_config (string): updatable_names (list): ["^dec_rnn", "^fc"] """ def name_matched(name, updatable_names): for updatable_name in updatable_names: if re.match(updatable_name, name) is not None: return True return False # Set learning rate to zero in layers which name does not follow regex parameters = [] for name, params in model.named_parameters(): if name_matched(name, updatable_names) is True: print_rank("updating {} with lr = {}".format(name, optimizer_config["lr"])) parameters.append({"params": params, "lr":optimizer_config["lr"]}) else: print_rank("freezing {}".format(name)) parameters.append({"params": params, "lr": 0.0}) return parameters def save_model(model_path, config, model, optimizer, lr_scheduler, ss_scheduler, token=None): """Save a model as well as training information.""" save_state = { "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict() if optimizer is not None else None, "lr_scheduler_state_dict": lr_scheduler.state_dict() if lr_scheduler is not None else None } if ss_scheduler is not None: save_state["ss_scheduler_state_dict"] = ss_scheduler.state_dict() if token: # just save as "best" and return save_path = os.path.join(model_path, "{}_model.tar".format(token)) else: save_path = 
os.path.join(model_path, "model.tar") print_rank("Saving model to: {}".format(save_path)) try_except_save(torch_save, state_or_model=save_state, save_path=save_path) # Write out the config to model_dir if config is not None: try_except_save(write_yaml, config=config, save_path=os.path.join(model_path, "config.yaml")) ================================================ FILE: doc/sphinx/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: doc/sphinx/advanced.rst ================================================ Advanced Topics =============== Privacy ------- Aggregation Options ------------------- Optimizer Options ----------------- ================================================ FILE: doc/sphinx/class_reference.rst ================================================ Class Reference =============== FLUTE Core ~~~~~~~~~~ core/server ----------- .. automodule:: core.server :members: :special-members: __init__ core/client ----------- .. automodule:: core.client :members: :special-members: __init__ core/federated -------------- .. automodule:: core.federated :members: :special-members: __init__ core/config ----------- .. 
automodule:: core.config :members: :special-members: __init__ ================================================ FILE: doc/sphinx/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- project = 'FLUTE' copyright = '2021, Microsoft Research' author = 'Microsoft Research' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # #html_theme = 'alabaster' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] import sphinx_rtd_theme html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] ================================================ FILE: doc/sphinx/index.rst ================================================ .. FLUTE documentation master file, created by sphinx-quickstart on Sat Jun 19 09:15:36 2021. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to FLUTE documentation! =============================== .. toctree:: :maxdepth: 2 :caption: Contents: overview scenarios launch advanced reference class_reference Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: doc/sphinx/launch.rst ================================================ Launch FLUTE ================ Local run ------------ Install the requirements stated inside of requirements.txt. Ideally this sould be done inside of a virtual environment, for instance, using Anaconda. .. code:: bash conda create -n FLUTE python==3.8 pip install -r requirements.txt FLUTE uses torch.distributed API as its main communication backbone, supporting three buil-in backends. For more information please refer to [Distributed Communication Package](https://pytorch.org/docs/stable/distributed.html). Therefore, we highly suggest to use NCCL backend for distributed GPU training and Gloo for distributed CPU training. There is no `setup.py` as FLUTE is not currently distributed as a package, but instead meant to run from the root of the repository. After this initial setup you can use your data for launching a local run. However the following instructions will be adapted to run ``nlg_gru`` task. For running this example, you need to first download and preprocess the data. 
Instructions can be found `here`_. Once the data is available you can run FLUTE from root as follows: .. code:: bash python -m torch.distributed.run --nproc_per_node=3 e2e_trainer.py -dataPath ./testing/mockup -outputPath scratch -config testing/configs/hello_world_local.yaml -task nlg_gru -backend nccl .. _here: https://github.com/microsoft/msrflute/tree/main/testing If the setup of the experiment has been done correctly, after the model initialization we would be able to see the clients being trained: .. figure:: img/run.png :align: center :width: 800 Local run for nlg_gru task. AML Run ------------ FLUTE has a native integration for job submissions with Azure ML, allowing users to use the built-in CLI or web interface for job/experiment tracking. For running experiments on AzureML, the CLI can help. You should first install the CLI `install the CLI`_ (make sure you have v2) and `create a resource group and workspace`_. You can then create a compute cluster, type ``az ml compute create -h`` for more info. Afterwards, you should write a YAML file with instructions for the job; we provide a simple example below: .. _install the CLI: https://docs.microsoft.com/en-us/azure/machine-learning/reference-azure-machine-learning-cli .. _create a resource group and workspace: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli?tabs=vnetpleconfigurationsv1cli%2Ccreatenewresources%2Cworkspaceupdatev1%2Cworkspacesynckeysv1%2Cworkspacedeletev1 .. code:: yaml experiment_name: basic_example description: Basic example of AML config for submitting FLUTE jobs code: local_path: . 
compute: azureml:Test environment: image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel inputs: data: folder: azureml://datastores/data/paths/cifar mode: rw_mount command: > apt -y update && apt -y install openmpi-bin libopenmpi-dev openssh-client && python3 -m pip install --upgrade pip && python3 -m pip install -r requirements.txt && python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -outputPath=./outputs -dataPath={inputs.data} -task=classif_cnn -config=./experiments/classif_cnn/config.yaml -backend=nccl You should replace ``compute`` with the name of the one you created before, and adjust the path of the datastore containing the data. In the example above, we created a datastore called ``data`` and added to it a folder called ``cifar``, which contained the two HDF5 files. The command passed above will install dependencies and then launch a NCCL job with 4 threads, for the experiment defined in ``experiments/classif_cnn``. Details on how to run a job using the AzureML CLI are given in its `documentation`_ , but typically it suffices to set up the environment and type ``az ml job create -f ``. In the same page of the documentation, you can also find more info about how to set up the YAML file above, in case other changes are needed. .. _documentation: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-train-cli .. note:: The local_path above is relative to the location of the YAML file. Setting it to ``.`` assumes it is in the same folder as ``e2e_trainer.py``. .. note:: All files on this folder will be uploaded to Azure, including hidden folders such as ``.git``, make sure to remove large files and folders that are not needed. After launching the experiment, you can follow it on AzureML Studio, which prints logs, plots metrics and makes the output easily available after the experiment is finished. 
================================================
FILE: doc/sphinx/make.bat
================================================
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd


================================================
FILE: doc/sphinx/overview.rst
================================================
FLUTE Overview
==============

FLUTE: Federated Learning Utilities and Tools for Experimentation is a high-performance open source platform that enables researchers and developers to perform rapid prototyping and offline simulations of novel federated learning algorithms at scale.

A FLUTE job consists of one or more nodes (physical or virtual machines) executing a total of K workers that can become a Server or Client.

.. figure:: img/client-server.png
   :align: center
   :width: 600

FLUTE uses a distributed processing architecture backed by torch.distributed. Worker 0 acts as a central orchestrator, maintaining and distributing the central model to workers, and subsequently distributing client tasks to them.

On each training round the orchestrator takes care of:

* Dispatch the central model to the rest of the workers
* Queues up client tasks for workers to execute.
Workers receive client tasks (client training data and training config) and:

* Execute SGD on the central model using their client's training data
* Send model delta (pseudo-gradient) back to the orchestrator.

Each worker>0 processes client tasks sequentially, consisting of data encoding and one or more batch updates to the central model (note the central model is reset to its original state for each client task). As each client task completes, the model delta, aka the pseudo-gradient, is sent back to the orchestrator for federation into a new central model.

Execution runs for up to N training rounds. In each round the orchestrator may sample a subset of clients, and may also randomly delay pseudo-gradient updates from some clients to future rounds. The orchestrator will also periodically distribute evaluation tasks to determine model quality on validation and test data.

.. note:: AzureML generally expects there will be one worker per GPU on each node.

Architecture
------------

FLUTE design is based on a central server architecture.

.. figure:: img/architecture.png
   :align: center
   :width: 500

   FLUTE logical workflow.

The logical workflow performed is:

1. Send an initial global model to clients.
2. Train instances of the global model with locally available data on each client.
3. Send training information to the Server (e.g. adapted models, logits, pseudo-gradients).
4. Combine the returned information on the server to produce a new model.
5. Optionally, update the global model with an additional server-side rehearsal step.
6. Send the updated global model back to the clients.
7. Repeat steps 2-6 after sampling a new subset of clients for the next training iteration.
================================================ FILE: doc/sphinx/reference.rst ================================================ Option Reference ================ Command Line Arguments ---------------------- YAML Configuration ------------------ FLUTE yaml files consist of three main sections, and a few optional sections. The `model_config` specifies model architecture and pretrained model setup path. The `server_config` section defines server settings such as total training rounds, aggregation method, optimizer settings, learning rate schedule, and any server-side training data. The `client_config` section specifies client optimizer settings and the client-side training data. .. note:: Training data is loaded by the server and dispatched to the clients. The configuration settings for this data are specified in the `client_config`. model_config ~~~~~~~~~~~~ server_config ~~~~~~~~~~~~~ client_config ~~~~~~~~~~~~~ Optional Sections ----------------- In addition to the main sections, some optional sections may be specified to control privacy settings, specifically a `dp_config` section for differential privacy settings, and `privacy_metrics_config` for applying privacy metrics. dp_config ~~~~~~~~~ privacy_metrics_config ~~~~~~~~~~~~~~~~~~~~~~ ================================================ FILE: doc/sphinx/requirements.txt ================================================ sphinx_rtd_theme jinja2==3.0.3 ================================================ FILE: doc/sphinx/scenarios.rst ================================================ Adding New Scenarios ==================== Data Preparation ------------ FLUTE provides the abstract class `BaseDataset` inside ``core/dataset.py`` that can be used to wrap any dataset and make it compatible with the platform. The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `num_samples` and `user_data_labels` (optional). These attributes are required to have these exact names. 
The abstract method ``load_data ()`` should be used to instantiate/load the dataset and provide the training format required by FLUTE on-the-fly. Here is a sample data blob for language model training. .. code:: json { "users": ["bert","elmo"], "user_data": { "bert": {"x": ["my name is Bert.", "I live with Ernie."]}, "elmo": {"x": ["Big Bird is my friend."]} }, "num_samples": [2, 1] } The blob consists of three fields: * ``users``: indicates a unique id for each user in the training data. Users are sampled uniformly to create client tasks during training. There could be many more users than client tasks per round or even over all client tasks over all rounds. * ``num_samples`` : indicates the number of samples for each user, in the same order as ``users`` list. That is, for any index ``i`` in ``range(len(data['users']))``: * ``user_data``: contains user-indexed training data. Each user's data is a dictionary of the form ``{"x": [list of examples]}``. If labels are needed by the task, ``user_data_label`` will be required by FLUTE with the user-indexed labels. The format should be similar to ``user_data`` where each user's label is a dictionary of the form ``{"x": [list of labels]}`` as follows: .. code:: json "user_data_label": { "bert": {"x": [ 0 , 1 ]}, "elmo": {"x": [ 0 ]} } .. note:: Test and validation data is formatted similarly. .. note:: Test/validate data is dispatched to workers by partitioning on users. If your test data isn't user-partitioned, we recommend partitioning it uniformly using some dummy user ids. Add the model to FLUTE -------------- FLUTE requires the model declaration framed in PyTorch, which must inhereit from the `BaseModel` class defined in ``core/model.py``. The following methods should be overridden: * __init__: model definition * loss: computes the loss used for training rounds * inference: computes the metrics used during evaluation rounds Please see the example provided below: .. 
code:: python from core.model import BaseModel class CNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = Net() def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) return F.cross_entropy(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() f1 = f1_score(labels.cpu(), torch.argmax(output, dim=1).cpu(), average='micro') # NOTE: Only the keys 'output','acc' and 'batch_size' does not require # extra fields as 'value' and 'higher is better'. FLUTE requires this # format only for customized metrics. return {'output':output, 'acc': accuracy, 'batch_size': n_samples, \ 'f1_score': {'value':f1,'higher_is_better': True}} Once the model is ready, all mandatory files must be in a single folder inside ´{/experiments´. Please adjust your files with the following naming structure so FLUTE can be able to find all the scripts needed. .. code-block:: bash task_name |---- dataloaders |---- dataloader.py |---- dataset.py |---- utils |---- utils.py (if needed) |---- model.py |---- config.yaml |---- README.txt .. note:: In case you need to import a module that has not been considered in FLUTE, this can be added in requirements.txt .. note:: All files must contain only absolute imports, in order to avoid issues when running. Implement new metrics -------------- The metrics computed during the evaluation rounds are declared inside `inference()` in the model declaration. 
FLUTE requires this function to return a dictionary with at least `output`, `acc` and `batch_size` as follows: .. code:: bash { "output": loss, "acc": accuracy, "batch_size": batch_size} In order to add a new metric, we just need to add the key inside the same dictionary with the following format: .. code:: bash { "output": loss, "acc": accuracy, "batch_size": batch_size, "custom_metric_1": {"value": value1 ,'higher_is_better': True}, "custom_metric_2": {"value": value2 ,'higher_is_better': False}} Once the keys have been included in the returning dictionary from `inference()`, FLUTE will automatically recognize them during the test/val rounds. .. note:: Only the keys `output`, `acc` and `batch_size` does not require a dictionary. Create the configuration file --------------------------------- The configuration file will allow you to specify the setup in your experiment, such as the optimizer, learning rate, number of clients and so on. FLUTE requires the following 6 sections: * model_config: path an parameters (if needed) to initialize the model. * dp_config: differential privacy setup. * privacy_metrics_config: for cache data to compute additional metrics. * strategy: defines the federated optimizer. * server_config: determines all the server-side settings. * client_config: dictates the learning parameters for client-side model updates. The blob below indicates the basic parameters required by FLUTE to run an experiment: .. 
code:: yaml model_config: model_type: CNN # Class name in model.py model_folder: experiments/classif_cnn/model.py # Relative path to the model declaration dp_config: enable_local_dp: false # DP disabled privacy_metrics_config: apply_metrics: false # Privacy metrics disabled strategy: DGA # Federated optimizar (DGA or FedAvg) server_config: wantRL: false # Whether to use RL-based meta-optimizers resume_from_checkpoint: false # Restart from checkpoint if file exists do_profiling: false # Run profiler and compute runtime metrics optimizer_config: # Optimizer used to update the global model type: sgd lr: 1.0 annealing_config: # Annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # Validation rounds frequency rec_freq: 100 # Testing rounds frequency initial_val: true # Enable initial validation round initial_rec: true # Enable initial testing round max_iteration: 2000 # Total of iteration rounds num_clients_per_iteration: 10 # Clients per interation data_config: # Information for the test/val dataloaders val: batch_size: 10000 val_data: test_data.hdf5 # Assign to null for data loaded on-the-fly test: batch_size: 10000 test_data: test_data.hdf5 # Assign to null for data loaded on-the-fly type: model_optimization # Server type (model_optimization is the only available for now) aggregate_median: softmax # How aggregations weights are computed initial_lr_client: 0.001 # Learning rate used on optimizer lr_decay_factor: 1.0 # Decay factor for LR weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. 
grad_mean_loss, train_loss) best_model_criterion: f1_score # Determines the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model softmax_beta: 1.0 # Beta value to use for the softmax DGA client_config: do_profiling: false # Run profiling and compute runtime metrics ignore_subtask: false # Determines which model loss to use. In most cases just set to False. data_config: # Information for the train dataloader train: batch_size: 4 list_of_train_data: train_data.hdf5 # Assign to null for data loaded on-the-fly desired_max_samples: 50000 optimizer_config: # Optimizer used by the client type: sgd lr: 0.001 # This is overridden by `initial_lr_client` momentum: 0.9 type: optimization # The type of client (always set "optimization for now") .. note:: Documented templates for all the options available in the configuration files are provided inside configs folder. ================================================ FILE: e2e_trainer.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' This is the main script to run on each NCCL/GLOO thread. It will spawn either a Server or Worker object -- the former is responsible for orchestrating and aggregating models, where as the latter processes clients' data to generate a new model. The Server lives on the very first thread, whereas remaining threads contain each a diferent Worker. 
''' import argparse import os import shutil import yaml import logging from psutil import virtual_memory import torch import torch.distributed as dist from azureml.core import Run from core import federated from core.config import FLUTEConfig from core.server import select_server from core.client import Client from experiments import make_model from utils import ( make_optimizer, init_logging, print_rank, find_pretrained_model ) from utils.dataloaders_utils import ( make_train_dataloader, get_dataset, ) from core.evaluation import make_eval_clients def log_run_properties(config: FLUTEConfig): """Log parameters on AzureML. Args: config (dict): config containing parameters to log. """ properties = {} # Build properties dictionary mem = virtual_memory() properties["System memory (GB)"] = float(mem.total) / (1024**3) props = [ ("server_config.num_clients_per_iteration", 0), ("server_config.max_iteration", 0), ("dp_config.eps", 0), ("dp_config.max_weight", 0), ("dp_config.min_weight", 0), ("server_config.optimizer_config.type", "sgd"), ("server_config.optimizer_config.lr", 1.0), ("server_config.optimizer_config.amsgrad", False), ("server_config.annealing_config.type", "step_lr"), ("server_config.annealing_config.step_interval", "epoch"), ("server_config.annealing_config.gamma", 1.0), ("server_config.annealing_config.step_size", 100), ] for (key, default) in props: properties[key] = config.lookup(key, default) # Log the properties dictionary into AzureML run = Run.get_context() for k in properties: run.log(k, properties[k]) def run_worker(model_path, config, task, data_path, local_rank, backend): """Spawn worker object that lives throughout NCCL/GLOO thread. Args: model_path (str): path to the pretrained model. config (dict): dictionary containing parameters. task (str): what task to solve, must be a folder of :code:`experiments`. data_path (str): path to data. local_rank (int): the rank of the NCCL/GLOO thread. 
""" model_config = config["model_config"] server_config = config["server_config"] client_config = config["client_config"] # Backend initialization WORLD_RANK = federated.rank() LOCAL_RANK = federated.local_rank() print_rank(f"Backend: {backend}") dist.init_process_group(backend=backend, init_method=None, rank=WORLD_RANK, world_size=federated.size()) # Assign NCCL thread to a specific GPU if torch.cuda.is_available(): print_rank(f"Assigning worker to GPU {LOCAL_RANK}") device = torch.device("cuda:{}".format(LOCAL_RANK)) torch.cuda.set_device(device) # Make the Model to distribute to workers model = make_model(model_config) # Get evaluation datasets val_dataset = get_dataset(data_path, config, task, mode="val", test_only=True) test_dataset = get_dataset(data_path, config, task, mode="test", test_only=True) # Create list of clients for test/val -- Server need the indexes and Worker the clients list val_clients = list(make_eval_clients(val_dataset, config)) test_clients = list(make_eval_clients(test_dataset, config)) # pre-cache the training data and capture the number of clients for sampling num_clients = Client.get_train_dataset(data_path, config, task) config["server_config"]["data_config"]["num_clients"] = num_clients # Instantiate the Server object on the first thread if WORLD_RANK == 0: single_worker = None if federated.size() == 1: # For a single-GPU/CPU execution using NCCL, Server and Worker are instantiated in the same GPU. 
single_worker = federated.Worker(model=model, data_path=data_path, do_profiling=client_config.get("do_profiling", False), val_clients=val_clients, test_clients=test_clients, val_dataset = val_dataset, test_dataset = test_dataset, config= config) single_worker.run() try: print_rank('Server data preparation') if 'train' in config['server_config']['data_config']: server_train_dataloader = make_train_dataloader(config['server_config']['data_config']['train'], data_path, task=task, clientx=None) else: server_train_dataloader = None idx_val_clients = list(range(len(val_clients))) # Generates indexes for val clients idx_test_clients = list(range(len(test_clients))) # Generates indexes for test clients print_rank("Prepared the dataloaders") # Create the optimizer on the server optimizer = make_optimizer(server_config["optimizer_config"], model) # Load a model that's already trained best_trained_model = find_pretrained_model(model_path, model_config) if best_trained_model is not None: model_state_dict = torch.load(best_trained_model, map_location=None if torch.cuda.is_available() else torch.device("cpu")) model.load_state_dict(model_state_dict) server_type = server_config["type"] server_setup = select_server(server_type) # Return the server class server = server_setup( num_clients=config['server_config']['data_config']["num_clients"], model=model, optimizer=optimizer, ss_scheduler=None, data_path=data_path, model_path=model_path, server_train_dataloader=server_train_dataloader, config=config, idx_val_clients=idx_val_clients, idx_test_clients=idx_test_clients, single_worker=single_worker, ) log_run_properties(config) except Exception as e: # Be sure the other workers are shut down. 
server.terminate_workers() raise e print_rank("Launching server") server.run() else: # Instantiate client-processing Worker on remaining threads print_rank("Worker on node {}: process started".format(WORLD_RANK)) worker = federated.Worker( model=model, data_path=data_path, do_profiling=client_config.get("do_profiling", False), val_clients=val_clients, test_clients=test_clients, val_dataset = val_dataset, test_dataset = test_dataset, config= config, ) worker.run() if __name__ == "__main__": # Parse command-line arguments parser = argparse.ArgumentParser() parser.add_argument("-config") parser.add_argument("-outputPath") parser.add_argument("-dataPath", default=None) parser.add_argument("-task", default=None, help="Define the task for the run") parser.add_argument("-backend", default=None, help="Define the communication protocol") parser.add_argument("-num_skip_decoding", default=-1, type=int, help="Skip decoding in unsupervised learning mode") parser.add_argument("--local_rank", default=-1, type=int) args = parser.parse_args() data_path = args.dataPath task = args.task local_rank = args.local_rank assert args.backend in ['nccl','gloo'], f"Backend {args.backend} not recognized, please select nccl or gloo" backend = args.backend # The mount point can also be retrieved from input_datasets of the run context if data_path is None: data_path = Run.get_context().input_datasets["input"] print("The data can be found here: ", data_path) # Update the model path for the sake of AzureML id = Run.get_context().id experiment_name = "-".join(id.split("-")[-4:-2]) experiment_root = os.path.join(args.outputPath, experiment_name) os.makedirs(experiment_root, exist_ok=True) model_path = os.path.join(experiment_root, "models") log_path = os.path.join(experiment_root, "log") os.makedirs(model_path, exist_ok=True) os.makedirs(log_path, exist_ok=True) # Make a copy of the config file into the output folder, for future reference cfg_out = os.path.join(experiment_root, "FLUTE_config.yaml") 
if local_rank <= 0: shutil.copyfile(args.config, cfg_out) # Initialize logging init_logging(log_path, loglevel=logging.INFO) with open(args.config) as f: cfg_dict = yaml.safe_load(f) config = FLUTEConfig.from_dict(cfg_dict) config["data_path"] = data_path config["output_path"] = args.outputPath config["model_path"]= model_path config["experiment_name"] = experiment_name config["client_config"]["task"] = task config["server_config"]["task"] = task config.validate() # Instantiate either Server or Worker on the thread run_worker(model_path, config, task, data_path, local_rank, backend) ================================================ FILE: experiments/__init__.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch from utils import print_rank, print_cuda_stats, to_device from importlib.machinery import SourceFileLoader def make_model(model_config, dataloader_type=None, input_dim=-1, output_dim=-1): print('Preparing model .. 
Initializing') try: dir = "./"+ str(model_config["model_folder"]) model_class = model_config["model_type"] loader = SourceFileLoader(model_class,dir).load_module() model_type = getattr(loader,model_class ) except: raise ValueError("{} model not found, make sure to indicate the model path in the .yaml file".format(model_config["type"])) model = model_type(model_config) print(model) if not "weight_init" in model_config or model_config["weight_init"] == "default": print_rank("initialize model with default settings") pass elif model_config["weight_init"] == "xavier_normal": print_rank("initialize model with xavier_normal") for p in model.parameters(): if p.dim() > 1: # weight torch.nn.init.xavier_normal_(p.data) elif p.dim() == 1: # bias p.data.zero_() for m in model.modules(): if isinstance(m, (torch.nn.Embedding, torch.nn.LayerNorm, torch.nn.BatchNorm2d)): m.reset_parameters() else: return ValueError("{} not supported".format(model_config["weight_init"])) print_rank("trying to move the model to GPU") model = to_device(model) print_rank("model: {}".format(model)) print_cuda_stats() return model ================================================ FILE: experiments/classif_cnn/.gitignore ================================================ utils/data *.hdf5 *.json ================================================ FILE: experiments/classif_cnn/README.md ================================================ # Simple example of a CNN on CIFAR-10 Our objective here is to bring a simple experiment from the Pytorch tutorials, more specifically the one in https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py, and convert it to FLUTE. Instructions on how to do this are given below. An adapted version of the tutorial above is provided in the `utils/centralized_training.py` script. 
## Preparing the data In this experiment we are making use of the CIFAR10 Dataset from torchvision, initialized in `dataloaders/cifar_dataset.py`, which inherits from the FLUTE base dataset class `core/dataset.py` ## Specifying the model Next, we prepare the model. The `model.py` file contains two classes: one is the `Net` class already contained in the original script, and the other, a class called `CNN` which effectively wraps `Net`. Importantly, the `CNN` class defines two methods: `loss` and `inference`; both perform forward steps and then perform additional computations, in particular, the former executes the loss' evaluation, and the latter the metrics' computation. The format of the inputs and outputs should be the same as in this example. ## Specifying dataset and dataloaders Inside the `dataloaders` folder, there are two files: `dataset.py` and `dataloader.py`. Both inherit from the base classes declared in the `core` folder, which under the hood inherit from Pytorch classes with the same name. The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `user_data_labels` and `num_samples` (user names, user features, user labels if the problem is supervised, and number of samples for each user, respectively). These attributes are required to have these exact names. Otherwise, it should also be able to access the examples of a specific user, whose id is passed during initialization via the `user_idx` argument. The dataloader is simpler, and essentially just instantiates the dataset and creates batches with a specific format. ## Creating a config file All the parameters of the experiment are passed in a YAML file. A documented example is provided in `config.yaml`. ## Running the experiment Finally, to launch the experiment, it suffices to launch the `e2e_trainer.py` script using torch.distributed.
``` python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath experiments/classif_cnn/utils/data -outputPath scratch -config experiments/classif_cnn/config.yaml -task classif_cnn -backend gloo ``` The `dataPath`, `outputPath` and `config` arguments should just specify the respective files or folders, as in the example above -- in this case, a folder called `scratch` will be created containing logs and checkpoints. The task should be the name of the folder insider `experiments`. Following what is specified in the config file, the experiment will run for 2000 rounds, and during each of them 10 clients will be selected at random, each of whom has 50 samples. It is more or less the same, then, as the 2 epochs in the centralized training, except that clients are selected at random so we might not see all of them. ================================================ FILE: experiments/classif_cnn/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. # Parameters needed to initialize the model model_config: model_type: CNN # class w/ `loss` and `inference` methods model_folder: experiments/classif_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # how many iterations between metric eval on val set rec_freq: 100 # how many iterations between metric eval on test set initial_val: true initial_rec: true max_iteration: 2000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 10000 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: f1_score fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 4 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.9 type: optimization ================================================ FILE: experiments/classif_cnn/dataloaders/cifar_dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import time import torchvision import torchvision.transforms as transforms class CIFAR10: def __init__(self) : # Get training and testing data from torchvision transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) print('Processing training set...') self.trainset=_process(trainset, n_users=1000) print('Processing test set...') self.testset=_process(testset, n_users=200) def _process(dataset, n_users): '''Process a Torchvision dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) # Convert training data to expected format print('Converting data to expected format...') start_time = time.time() data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : 10000 * [samples_per_user], 'user_data' 
: {f'{user_id:04d}': dataset.data[user_idxs(user_id)].tolist() for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } print(f'Finished converting data in {time.time() - start_time:.2f}s.') return data_dict ================================================ FILE: experiments/classif_cnn/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch from core.dataloader import BaseDataLoader from experiments.classif_cnn.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/classif_cnn/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.classif_cnn.dataloaders.cifar_dataset import CIFAR10 class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = CIFAR10() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/classif_cnn/model.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import torch from torch import nn from torch.nn import functional as F from sklearn.metrics import f1_score from core.model import BaseModel class Net(nn.Module): '''The standard PyTorch model we want to federate''' def __init__(self): super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = torch.flatten(x, 1) # flatten all dimensions except batch x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x class CNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = Net() def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) return F.cross_entropy(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() f1 = f1_score(labels.cpu(), torch.argmax(output, dim=1).cpu(), average='micro') # NOTE: Only the keys 'output','acc' and 'batch_size' does not require # extra fields as 'value' and 'higher is better'. FLUTE requires this # format only for customized metrics. 
return {'output':output, 'acc': accuracy, 'batch_size': n_samples, \ 'f1_score': {'value':f1,'higher_is_better': True}} ================================================ FILE: experiments/classif_cnn/utils/centralized_training.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. '''Simple example of a CNN on CIFAR-10 This is adapted from the Pytorch tutorials. See https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py for more info. ''' import torch import torchvision import torchvision.transforms as transforms import torch.nn as nn import torch.nn.functional as F import torch.optim as optim # Parameters BATCH_SIZE = 4 N_EPOCHS = 2 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Create dataloaders transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2) # Define the model class Net(nn.Module): def __init__(self): super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = torch.flatten(x, 1) # flatten all dimensions except batch x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x # Instantiate model, loss and optimizer net = Net().to(device) criterion = nn.CrossEntropyLoss() optimizer = 
optim.SGD(net.parameters(), lr=0.001, momentum=0.9) # Training loop for epoch in range(N_EPOCHS): # loop over the dataset multiple times running_loss = 0.0 for i, data in enumerate(trainloader, 0): # Get the inputs; data is a list of [inputs, labels] inputs, labels = data[0].to(device), data[1].to(device) # Zero the parameter gradients optimizer.zero_grad() # Forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # Print statistics running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 # Compute accuracy correct = 0 total = 0 with torch.no_grad(): for data in testloader: images, labels = data[0].to(device), data[1].to(device) outputs = net(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() print('Accuracy of the network on the 10000 test images: %d %%' % ( 100 * correct / total)) ================================================ FILE: experiments/classif_cnn/utils/download_and_convert_data.py ================================================ import h5py import json import time import torchvision import torchvision.transforms as transforms import tqdm def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in tqdm.tqdm(data_dict['user_data'].items()): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in tqdm.tqdm(data_dict['user_data_label'].items()): user_data_label_group.create_dataset(user, 
data=user_data_label) def _process_and_save_to_disk(dataset, n_users, file_format, output): '''Process a Torchvision dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) # Convert training data to expected format print('Converting data to expected format...') start_time = time.time() data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : 10000 * [samples_per_user], 'user_data' : {f'{user_id:04d}': dataset.data[user_idxs(user_id)].tolist() for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } print(f'Finished converting data in {time.time() - start_time:.2f}s.') # Save training data to disk print('Saving data to disk...') start_time = time.time() if file_format == 'json': with open(output + '.json', 'w') as json_file: json.dump(data_dict, json_file) elif file_format == 'hdf5': with h5py.File(output + '.hdf5', 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) else: raise ValueError('unknown format.') print(f'Finished saving data in {time.time() - start_time:.2f}s.') # Get training and testing data from torchvision transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) print('Processing training set...') _process_and_save_to_disk(trainset, n_users=1000, file_format='hdf5', output='./data/train_data') print('Processing test 
set...') _process_and_save_to_disk(testset, n_users=200, file_format='hdf5', output='./data/test_data') ================================================ FILE: experiments/cv/README.md ================================================ # Simple example of ResNet model using personalization Our objective here is to bring a simple experiment of Computer Vision task, and convert it to FLUTE using the personalization feature. Instructions on how to do this are given below. ## Preparing the data In this experiment we are making use of the CIFAR10 Dataset from torchvision, initializated in `data.py`, which is wrapped by FLUTE Base Dataset. ## Specifying the model Next, we prepare the model. The `model.py` file contains different classes than can be used for this experiment. However, for this example we are using the `ResNet` class . Importantly, the `ResNet` class inheeits from `Base Model` declared in `core/model.py` and defines two methods: `loss` and `inference`; both perform forward steps and then perform additional computations, in particular, the former executes the loss' evaluation, and the latter the metrics' computation. The format of the inputs and outputs should be the same as in this example. ## Specifying dataset and dataloaders Inside the `dataloaders` folder, there are two files: `dataset.py` and `dataloader.py`. Both inherit from the base classes declared in `core` folder, that under the hood inhereit from Pytorch classes with same name. The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `user_data_labels` and `num_samples` (user names, user features, user labels if the problem is supervised, and number of samples for each user, respectively). These attributes are required to have these exact names. Otherwise, it should also be able to access the examples of a specific user, which id is passed during initialization via the `user_idx` argument. 
The dataloader is simpler, and essentially just instantiates the dataset and creates batches with a specific format. ## Creating a config file All the parameters of the experiment are passed in a YAML file. A documented example is provided in `config.yaml`. ## Running the experiment Finally, to launch the experiment, it suffices to launch the `e2e_trainer.py` script using torch.distributed. ``` python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ./ -outputPath scratch -config experiments/classif_cnn/config.yaml -task cv -backend gloo ``` The `dataPath`, `outputPath` and `config` arguments should just specify the respective files or folders, as in the example above -- in this case, `dataPath` can be any path given that data is being downloaded on-the.fly. A folder called `scratch` will be created containing logs and checkpoints. The task should be the name of the folder insider `experiments`. ================================================ FILE: experiments/cv/config.yaml ================================================ model_config: model_type: resnet50 #vgg11 # class w/ `loss` and `inference` methods model_folder: experiments/cv/model.py # file containing class num_classes: 10 dp_config: enable_local_dp: false # whether to enable user-level DP privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics strategy: DGA # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics save_to_disk: false # save the updated dataset in disk optimizer_config: # this is the optimizer used to update the model type: adam lr: 0.001 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.00 step_size: 100 val_freq: 1000 # how many iterations between metric eval on val set rec_freq: 5 # how many iterations between metric eval on test set initial_val: False initial_rec: True max_iteration: 1000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration total_num_clients: 100 data_config: # where to get val and test data from val: batch_size: 128 val_data: null test: batch_size: 128 test_data: null type: personalization # Options: personalization | model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 1.0 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false convex_model_interp: 0.75 # This is specific to personalization server/client data_config: # where to get training data from train: batch_size: 128 list_of_train_data: null desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv/data.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import logging import h5py import json import os import torchvision from torchvision import transforms import numpy as np from numpy.random import RandomState from utils import print_rank class DataPartitioner(object): """ Partitions a dataset into different chunks. """ def __init__(self, data, sizes=None, rnd=0, alpha=0, num_c=10, dataset=None, lab_distr=None, ratio=1, img_size=32, wantTrans=True): self.data = data self.dataset = dataset self.total_num= len(sizes) if sizes is not None else len(lab_distr) self.img_size= img_size self.wantTrans= wantTrans if lab_distr is not None: self.partitions, self.dat_stat = self.__use_fixed_lab_distr__(data, lab_distr, ratio, rnd, num_c) else: self.partitions, self.ratio, self.dat_stat, self.endat_size = self.__getDirichletData__(data, sizes, alpha, num_c, rnd) def get_lab_distr(self): return self.dat_stat def return_partition(self, partition, flag='data', is_train_set=True): if flag != 'data': return [self.data[idx][1] for idx in self.partitions[partition]] mean = [x / 255 for x in [125.3, 123.0, 113.9]] std = [x / 255 for x in [63.0, 62.1, 66.7]] if self.wantTrans: dc = {'resize': 0.5 if is_train_set else None, 'pad': None, 'crop': None, 'flip': False, 'rotate': (-180+2*int(partition*180/self.total_num), -180+2*int((partition+1)*180/self.total_num)) if is_train_set else \ (-180+2*int(partition*180/self.total_num)+2, -180+2*int(partition*180/self.total_num)+2), 'normalize': [mean, std]} else: dc = {'resize': None, 'pad': None, 'crop': None, 'flip': False, 'rotate': None, 'normalize': [mean, std]} transform = get_transform(transform=dc,img_size=self.img_size) return {'x': [transform(self.data[idx][0]).tolist() for idx in self.partitions[partition]]} def __use_fixed_lab_distr__(self, data, lab_distr, ratio, rnd, num_c): n_nets = [] idx_batch = [] labelList = np.array(data.targets) rann = RandomState(rnd) # Find where all labels are label_dict={lab: np.where(labelList == lab)[0] for lab in range(num_c)} # Process the prefixed 
label distributions one by one for lab_indices in list(lab_distr.keys())[:-1]: net_dataidx_map = {} for lab, num in lab_distr[lab_indices].items(): len_k = len(label_dict[lab]) idx_k = label_dict[lab][:min(int(num*ratio), len_k)] label_dict[lab] = label_dict[lab][min(int(num*ratio), len_k):] if len(idx_k)>0: net_dataidx_map[lab] = list(idx_k) n_nets.append(net_dataidx_map) net_dataidx_map = {} for lab, idx_k in label_dict.items(): if len(idx_k)>0: net_dataidx_map[lab] = idx_k n_nets.append(net_dataidx_map) for i, lab_indices in enumerate(n_nets): idx_batch.append([item for sublist in lab_indices.values() for item in sublist]) net_cls_counts = {} for net_i, dataidx in enumerate(idx_batch): unq, unq_cnt = np.unique(labelList[dataidx], return_counts=True) tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} net_cls_counts[net_i] = tmp print_rank('Data statistics: %s' % str(net_cls_counts), loglevel=logging.DEBUG) if 0: count=0 tot_count={i:0 for i in range(10)} for _, client in net_cls_counts.items(): for lab, num in client.items(): tot_count[lab]+=num count+=num print('Debugging:', tot_count, count) return idx_batch, net_cls_counts # Getting this function from FedML -- 02-17-22 def __getDirichletData__(self, data, psizes, alpha, num_c, rnd): n_nets = len(psizes) K = num_c labelList = np.array(data.targets) min_size = 0 N = len(labelList) rann = RandomState(rnd) net_dataidx_map = {} while min_size < K: idx_batch = [[] for _ in range(n_nets)] # for each class in the dataset for k in range(K): idx_k = np.where(labelList == k)[0] rann.shuffle(idx_k) proportions = rann.dirichlet(np.repeat(alpha, n_nets)) ## Balance proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)]) proportions = proportions / proportions.sum() proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))] min_size = min([len(idx_j) for idx_j in idx_batch]) 
for j in range(n_nets): rann.shuffle(idx_batch[j]) net_dataidx_map[j] = idx_batch[j] net_cls_counts = {} for net_i, dataidx in net_dataidx_map.items(): unq, unq_cnt = np.unique(labelList[dataidx], return_counts=True) tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} net_cls_counts[net_i] = tmp local_sizes = [] for i in range(n_nets): local_sizes.append(len(net_dataidx_map[i])) local_sizes = np.array(local_sizes) weights = local_sizes / np.sum(local_sizes) print_rank('Data statistics: %s' % str(net_cls_counts), loglevel=logging.DEBUG) print_rank('Data ratio: %s' % str(weights), loglevel=logging.DEBUG) if 0: count=0 tot_count={i:0 for i in range(10)} for _, client in net_cls_counts.items(): for lab, num in client.items(): tot_count[lab]+=num count+=num print('Debugging:', tot_count, count) return idx_batch, weights, net_cls_counts, np.sum(local_sizes) def partition_dataset(rnd, img_size, image, total_num_clients, image_path, alpha, wantTransform): partition_sizes = [1.0/total_num_clients for _ in range(total_num_clients)] if image == 'cifar': trainset = torchvision.datasets.CIFAR10( root=os.path.join(image_path, image), train=True, download=True, transform=None) train_partition = DataPartitioner(trainset, partition_sizes, rnd, alpha=alpha, num_c=10, img_size=img_size, wantTrans=wantTransform) testset = torchvision.datasets.CIFAR10( root=os.path.join(image_path, image), train=False, download=True, transform=None) if 0: lab_distr= train_partition.get_lab_distr() test_partition = DataPartitioner(testset, lab_distr=lab_distr, rnd=rnd, ratio=0.2, num_c=10, img_size=img_size, wantTrans=wantTransform) else: test_partition = DataPartitioner(testset, partition_sizes, rnd, alpha=alpha, num_c=10, img_size=img_size, wantTrans=wantTransform) elif image == 'cifar100': trainset = torchvision.datasets.CIFAR100( root=os.path.join(image_path, image), train=True, download=True, transform=transform_train) # NOTE: Is this working? 
train_partition = DataPartitioner(trainset, partition_sizes, rnd, alpha=alpha, num_c=100) testset = torchvision.datasets.CIFAR100( root=os.path.join(image_path, image), train=False, download=True, transform=transform_test) test_partition = DataPartitioner(testset, partition_sizes, rnd, alpha=alpha, num_c=100) return train_partition, test_partition # Setup all necessary image datasets for training def prepare_dataset(rnd=2020, img_size=40, image='cifar', total_num_clients=100, image_path="./", alpha= 1.0, wantTransform=False, save_to_disk=False): train_partition, test_partition = partition_dataset(rnd=rnd, img_size=img_size, image=image, total_num_clients=total_num_clients, image_path=image_path, alpha=alpha, wantTransform= wantTransform) datasets = ["train_dataset.hdf5", "test_dataset.hdf5"] print_rank('Processing {}... '.format(datasets), loglevel=logging.DEBUG) output = [_process_and_save_to_disk(train_partition if set == "train_dataset.hdf5" else test_partition, save_to_disk, file_format= set.split('.')[-1], output=set, is_train_set=True if set == "train_dataset.hdf5" else False) for set in datasets] return output[0], output[1] def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in data_dict['user_data']['x'].items(): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in data_dict['user_data_label'].items(): user_data_label_group.create_dataset(user, data=user_data_label) def _process_and_save_to_disk(dataset, save_to_disk, file_format, output, is_train_set=True): '''Process a Torchvision dataset to expected format and save to 
disk''' n_users = len(dataset.partitions) # Convert training data to expected format print_rank('Converting data to expected format...', loglevel=logging.DEBUG) data_dict = { 'users': [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples': [len(dataset.partitions[user_id]) for user_id in range(n_users)], 'user_data': {f'{user_id:04d}': dataset.return_partition(user_id, 'data', is_train_set) for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.return_partition(user_id, 'labels', is_train_set) for user_id in range(n_users)}, } # Save training data to disk print_rank('Saving data to disk...', loglevel=logging.DEBUG) if save_to_disk: if file_format == 'json': outfile =output + '.json' with open(outfile, 'w') as json_file: json.dump(data_dict, json_file) elif file_format == 'hdf5': outfile =output + '.hdf5' with h5py.File(outfile, 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) else: raise ValueError('unknown format.') print_rank('Finished saving data...{}'.format(outfile), loglevel=logging.DEBUG) else: outfile=data_dict return outfile def get_transform(transform, img_size=32): """Unpack transformations and apply to train or test splits""" transform_list = [transforms.ToTensor()] # resize if transform['resize'] is not None: transform_list.append(transforms.RandomResizedCrop(img_size, scale=(transform['resize'], 2*transform['resize']))) transform_list.append(torchvision.transforms.Pad(4)) else: transform_list.append(transforms.RandomCrop(img_size, padding=4)) #transform_list.append(transforms.Resize(img_size)) # padding if transform['pad'] is not None: transform_list.append(transforms.Pad(transform['pad'])) # crop if transform['crop'] is not None: transform_list.append(transforms.RandomResizedCrop(transform['crop'])) if transform['rotate'] is not None: transform_list.append(transforms.RandomRotation(transform['rotate'])) # flips if transform['flip']: transform_list.append(transforms.RandomHorizontalFlip()) 
transform_list.append(transforms.RandomVerticalFlip()) # normalization if transform['normalize'] is not None: transform_list.append(transforms.Normalize(mean=transform['normalize'][0], std=transform['normalize'][1])) return transforms.Compose(transform_list) ================================================ FILE: experiments/cv/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', 0), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) return {'x': torch.tensor(np.array(x)), 'y': torch.tensor(np.array(y)).long()} ================================================ FILE: experiments/cv/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.cv.data import prepare_dataset class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data['x'] for user_data in self.user_data.values()]) self.labels = np.hstack(list(self.user_data_label.values())) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = np.vstack([user_data['x'] for user_data in self.user_data.values()]) self.labels = np.hstack(list(self.user_data_label.values())) def __getitem__(self, idx): return self.features[idx].astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Download or load data from disk/memory. The `data` argument can be either the path to the JSON or HDF5 file that contains the expected dictionary, or the actual dictionary. In case data cannot be loaded, will be downloaded through `prepare_dataset` method.''' if data == None: training_dataset, test_dataset = prepare_dataset(rnd=2020, img_size=40, image='cifar', total_num_clients=100, image_path="./", save_to_disk= False, alpha= 1.0, wantTransform= False) data = test_dataset if test_only else training_dataset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/cv/model.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
''' Modified from https://github.com/pytorch/vision.git The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. ''' import torch as T import torch.nn as nn import numpy as np import logging logging.basicConfig(format='%(levelname)s - %(message)s', level=logging.DEBUG) from torch import Tensor from torch.utils.model_zoo import load_url as load_state_dict_from_url from typing import Type, Any, Callable, Union, List, Optional from core.model import BaseModel __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2'] model_urls = { 'resnet18': 'https://download.pytorch.org/models/resnet18-f37072fd.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-b627a593.pth', 'resnet50': 'https://download.pytorch.org/models/resnet50-0676ba61.pth', 'resnet101': 'https://download.pytorch.org/models/resnet101-63fe2227.pth', 'resnet152': 'https://download.pytorch.org/models/resnet152-394f9c45.pth', 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', } def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation) def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion: int = 1 def __init__( self, inplanes: int, planes: int, stride: 
int = 1, downsample: Optional[nn.Module] = None, groups: int = 1, base_width: int = 64, dilation: int = 1, norm_layer: Optional[Callable[..., nn.Module]] = None ) -> None: super(BasicBlock, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d if groups != 1 or base_width != 64: raise ValueError('BasicBlock only supports groups=1 and base_width=64') if dilation > 1: raise NotImplementedError("Dilation > 1 not supported in BasicBlock") # Both self.conv1 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv3x3(inplanes, planes, stride) self.bn1 = norm_layer(planes) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride def forward(self, x: Tensor) -> Tensor: identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class Bottleneck(nn.Module): # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) # while original implementation places the stride at the first 1x1 convolution(self.conv1) # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. # This variant is also known as ResNet V1.5 and improves accuracy according to # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. 
expansion: int = 4 def __init__( self, inplanes: int, planes: int, stride: int = 1, downsample: Optional[nn.Module] = None, groups: int = 1, base_width: int = 64, dilation: int = 1, norm_layer: Optional[Callable[..., nn.Module]] = None ) -> None: super(Bottleneck, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d width = int(planes * (base_width / 64.)) * groups # Both self.conv2 and self.downsample layers downsample the input when stride != 1 self.conv1 = conv1x1(inplanes, width) self.bn1 = norm_layer(width) self.conv2 = conv3x3(width, width, stride, groups, dilation) self.bn2 = norm_layer(width) self.conv3 = conv1x1(width, planes * self.expansion) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x: Tensor) -> Tensor: identity = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResNet(BaseModel): def __init__( self, block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], num_class: int = 1000, zero_init_residual: bool = False, groups: int = 1, width_per_group: int = 64, replace_stride_with_dilation: Optional[List[bool]] = None, norm_layer: Optional[Callable[..., nn.Module]] = None ) -> None: super(ResNet, self).__init__() if norm_layer is None: norm_layer = nn.BatchNorm2d self._norm_layer = norm_layer self.inplanes = 64 self.dilation = 1 if replace_stride_with_dilation is None: # each element in the tuple indicates if we should replace # the 2x2 stride with a dilated convolution instead replace_stride_with_dilation = [False, False, False] if len(replace_stride_with_dilation) != 3: raise ValueError("replace_stride_with_dilation should be None " "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) 
self.groups = groups self.base_width = width_per_group self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = norm_layer(self.inplanes) self.relu = nn.ReLU(inplace=True) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) self.fc = nn.Linear(512 * block.expansion, num_class) for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) # Zero-initialize the last BN in each residual branch, # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: for m in self.modules(): if isinstance(m, Bottleneck): nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] elif isinstance(m, BasicBlock): nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, stride: int = 1, dilate: bool = False) -> nn.Sequential: norm_layer = self._norm_layer downsample = None previous_dilation = self.dilation if dilate: self.dilation *= stride stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( conv1x1(self.inplanes, planes * block.expansion, stride), norm_layer(planes * block.expansion), ) layers = [] layers.append(block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer)) self.inplanes = planes * block.expansion for _ in range(1, blocks): layers.append(block(self.inplanes, planes, groups=self.groups, base_width=self.base_width, dilation=self.dilation, norm_layer=norm_layer)) return nn.Sequential(*layers) def forward(self, inputs): inp = inputs['x'].cuda() if T.cuda.is_available() else inputs['x'] x = self.conv1(T.transpose(inp, 1, 3)) x = self.bn1(x) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) x = self.layer2(x) x = self.layer3(x) x = self.layer4(x) x = self.avgpool(x) x = T.flatten(x, 1) x = self.fc(x) return x def get_logit(self, x = None, evalis = True, logmax=False): data, target = x if logmax: Softmax = T.nn.LogSoftmax(dim=1) else: Softmax = T.nn.Softmax(dim=1) data = data.cuda() if T.cuda.is_available() else data if evalis: self.eval() with T.no_grad(): # Run the forward pass output = self.forward(data) logits = Softmax(output) logits.detach_() else: self.train() output = self.forward(data) logits = Softmax(output) loss = 1 return logits.cpu(), target.cpu(), loss def inference(self, inputs): targets = 
inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass self.eval() output = self(inputs) output = T.nn.LogSoftmax(dim=1)(output) # accuracy accuracy = T.mean((T.argmax(output, dim=1) == targets).float()).item() output = {'probabilities': output.cpu().detach().numpy(), 'predictions': np.arange(0, targets.shape[0]), 'labels': targets.cpu().numpy()} return {'output':output, 'acc': accuracy, 'batch_size': targets.shape[0]} def loss(self, inputs): targets = inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass self.train() output = self.forward(inputs) loss = T.nn.functional.cross_entropy(output, targets) return loss def copy_state_dict(self, state_dict): self.state_dict=state_dict.clone() def get_model(self): return self def _resnet( arch: str, block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], pretrained: bool, progress: bool, **kwargs: Any ) -> ResNet: model = ResNet(block, layers, **kwargs) if pretrained: state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) # edit last layer state_dict['fc.weight'] = state_dict['fc.weight'][:kwargs['num_class']] state_dict['fc.bias'] = state_dict['fc.bias'][:kwargs['num_class']] model.load_state_dict(state_dict) return model def resnet18(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-18 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs) def resnet34(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-34 model from `"Deep Residual Learning for Image Recognition" `_. 
Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs) def resnet50(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-50 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) def resnet101(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-101 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) def resnet152(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNet-152 model from `"Deep Residual Learning for Image Recognition" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['num_class']= config['num_classes'] return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, **kwargs) def resnext50_32x4d(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNeXt-50 32x4d model from `"Aggregated Residual Transformation for Deep Neural Networks" `_. 
Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['groups'] = 32 kwargs['width_per_group'] = 4 kwargs['num_class']= config['num_classes'] return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) def resnext101_32x8d(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""ResNeXt-101 32x8d model from `"Aggregated Residual Transformation for Deep Neural Networks" `_. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['groups'] = 32 kwargs['width_per_group'] = 8 kwargs['num_class']= config['num_classes'] return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) def wide_resnet50_2(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""Wide ResNet-50-2 model from `"Wide Residual Networks" `_. The model is the same as ResNet except for the bottleneck number of channels which is twice larger in every block. The number of channels in outer 1x1 convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 channels, and in Wide ResNet-50-2 has 2048-1024-2048. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['width_per_group'] = 64 * 2 kwargs['num_class']= config['num_classes'] return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) def wide_resnet101_2(config, pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: r"""Wide ResNet-101-2 model from `"Wide Residual Networks" `_. The model is the same as ResNet except for the bottleneck number of channels which is twice larger in every block. 
The number of channels in outer 1x1 convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 channels, and in Wide ResNet-50-2 has 2048-1024-2048. Args: pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ kwargs['width_per_group'] = 64 * 2 kwargs['num_class']= config['num_classes'] return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) ================================================ FILE: experiments/cv/model_vgg.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. ''' Modified from https://github.com/pytorch/vision.git The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. ''' import math import torch as T import torch.nn as nn import numpy as np import logging logging.basicConfig(format='%(levelname)s - %(message)s', level=logging.DEBUG) __all__ = [ 'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', ] class VGG(nn.Module): ''' VGG model ''' def __init__(self, vgg, num_class, topK_results=None): super(VGG, self).__init__() self.topK_results = num_class if topK_results is None else topK_results self.vgg = vgg self.classifier = nn.Sequential( nn.Dropout(), nn.Linear(512, 512), nn.ReLU(True), nn.Dropout(), nn.Linear(512, 512), nn.ReLU(True), nn.Linear(512, num_class), ) if 0: # Initialize weights for m in self.modules(): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels m.weight.data.normal_(0, math.sqrt(2. 
/ n)) m.bias.data.zero_() def forward(self, inputs): inputs = inputs['x'].cuda() if T.cuda.is_available() else inputs['x'] x = self.vgg(inputs.view(-1,3,32,32)) x = T.flatten(x, 1) x = self.classifier(x) return x def loss(self, inputs): targets = inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass output = self(inputs) loss = T.nn.functional.cross_entropy(output, targets) return loss def inference(self, inputs): targets = inputs['y'].cuda() if T.cuda.is_available() else inputs['y'] # Run the forward pass output = self(inputs) # accuracy accuracy = T.mean((T.argmax(output, dim=1) == targets).float()).item() output = {'probabilities': output.cpu().detach().numpy(), 'predictions': np.arange(0, targets.shape[0]), 'labels': targets.cpu().numpy()} return {'output':output, 'val_acc': accuracy, 'batch_size': targets.shape[0]} def get_logit(self, inputs = None, evalis = True, logmax=False): data, targets = inputs if logmax: Softmax = T.nn.LogSoftmax(dim=1) else: Softmax = T.nn.Softmax(dim=1) data = data.cuda() if T.cuda.is_available() else data if evalis: self.eval() with T.no_grad(): # Run the forward pass output = self.forward(data) logits = Softmax(output) else: self.train() output = self.forward(data) logits = Softmax(output) loss = T.nn.functional.cross_entropy(output, targets) return logits.cpu(), targets.cpu(), loss.cpu() def copy_state_dict(self, state_dict): self.state_dict=state_dict.clone() def set_eval(self): """ Bring the model into evaluation mode """ self.eval() def set_train(self): """ Bring the model into train mode """ self.train() def make_layers(cfg, n_channels=3, batch_norm=True): layers = [] in_channels = n_channels for v in cfg: if v == 'M': layers += [nn.MaxPool2d(kernel_size=2, stride=2)] else: conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) if batch_norm: layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] else: layers += [conv2d, nn.ReLU(inplace=True)] in_channels = v return 
nn.Sequential(*layers) cfg = { 'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], } def vgg11(config): """VGG 11-layer model (configuration "A")""" num_class = config['num_classes'] return VGG(make_layers(cfg['A'], batch_norm=False),num_class) def vgg11_bn(config): """VGG 11-layer model (configuration "A") with batch normalization""" num_class = config['num_classes'] return VGG(make_layers(cfg['A'], batch_norm=True),num_class) def vgg13(config): """VGG 13-layer model (configuration "B")""" num_class = config['num_classes'] return VGG(make_layers(cfg['B'], batch_norm=False),num_class) def vgg13_bn(config): """VGG 13-layer model (configuration "B") with batch normalization""" num_class=config['num_classes'] return VGG(make_layers(cfg['B'], batch_norm=True),num_class) def vgg16(config): """VGG 16-layer model (configuration "D")""" num_class = config['num_classes'] return VGG(make_layers(cfg['D'], batch_norm=False),num_class) def vgg16_bn(config): """VGG 16-layer model (configuration "D") with batch normalization""" num_class = config['num_classes'] return VGG(make_layers(cfg['D'], batch_norm=True),num_class) def vgg19(config): """VGG 19-layer model (configuration "E")""" num_class=config['num_classes'] return VGG(make_layers(cfg['E'], batch_norm=False),num_class) def vgg19_bn(config): """VGG 19-layer model (configuration 'E') with batch normalization""" num_class=config['num_classes'] return VGG(make_layers(cfg['E'], batch_norm=True),num_class) ================================================ FILE: experiments/cv/server.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
'''
In this file, we define the server used by this experiment: a thin subclass
of core.server.OptimizationServer where customized server-side routines can
be added.
'''

from core.server import OptimizationServer

class PersonalizationServer(OptimizationServer):

    def __init__(self, num_clients, model, optimizer, ss_scheduler, data_path, model_path,
                server_train_dataloader, config, idx_val_clients, idx_test_clients):
        """
        Personalization Server.

        Currently identical to OptimizationServer; override methods here to
        add experiment-specific server-side behavior.

        Customized routines for server can be included here.
        """

        # No extra behavior yet -- delegate everything to the base server.
        super().__init__(num_clients, model, optimizer, ss_scheduler, data_path, model_path,
                server_train_dataloader, config, idx_val_clients, idx_test_clients)



================================================
FILE: experiments/cv_cnn_femnist/README.md
================================================
## FedML Benchmark

### Examples

The example in this folder was taken from [FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version).

### Data

FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository.

### Run

If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows:

```code
python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/cv_cnn_femnist/config.yaml -task cv_cnn_femnist -backend nccl
```

### Results

This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049).
``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. ```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "femnist" data_cache_dir: "~/fedml_data" partition_method: "hetero" partition_alpha: 0.5 model_args: model: "cnn" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 3400 client_num_per_round: 10 comm_round: 800 epochs: 1 batch_size: 20 client_optimizer: sgd learning_rate: 0.1 weight_decay: 0.001 validation_args: frequency_of_the_test: 50 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedemnist_cnn/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ================================================ FILE: experiments/cv_cnn_femnist/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. 
# Parameters needed to initialize the model model_config: model_type: CNN # class w/ `loss` and `inference` methods model_folder: experiments/cv_cnn_femnist/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50000 # not executing validation on this experiment, only testing rec_freq: 50 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 800 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 20 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 20 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.1 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 20 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.1 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv_cnn_femnist/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv_cnn_femnist.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/cv_cnn_femnist/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.cv_cnn_femnist.dataloaders.preprocess import FEMNIST class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if user_idx == -1: self.user = self.user_list self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = FEMNIST() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/cv_cnn_femnist/dataloaders/preprocess.py ================================================ import os import h5py import wget import tarfile data_cache_dir = "./data" DEFAULT_TRAIN_FILE = "fed_emnist_train.h5" DEFAULT_TEST_FILE = "fed_emnist_test.h5" ''' The FederatedEMNIST dataset is taken from FedML repository. 
For more information regarding this dataset, please refer to https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/FederatedEMNIST. In order to download the data run the following commands: - wget --no-check-certificate --no-proxy https://fedml.s3-us-west-1.amazonaws.com/fed_emnist.tar.bz2 - tar -xvf fed_emnist.tar.bz2 ''' class FEMNIST: def __init__(self) : download_files(data_cache_dir) # Preprocess the dataset train_h5 = h5py.File(os.path.join(data_cache_dir,'femnist', DEFAULT_TRAIN_FILE), "r") test_h5 = h5py.File(os.path.join(data_cache_dir, 'femnist',DEFAULT_TEST_FILE), "r") test_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} train_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} for user in test_h5['examples'].keys(): test_dict['users'].append(user) test_dict['num_samples'].append(len(test_h5['examples'][user]['pixels'][()])) test_dict['user_data'][user] = test_h5['examples'][user]['pixels'][()] test_dict['user_data_label'][user] = test_h5['examples'][user]['label'][()] for user in train_h5['examples'].keys(): train_dict['users'].append(user) train_dict['num_samples'].append(len(train_h5['examples'][user]['pixels'][()])) train_dict['user_data'][user] = train_h5['examples'][user]['pixels'][()] train_dict['user_data_label'][user] = train_h5['examples'][user]['label'][()] print(" Dictionaries ready .. 
") self.trainset, self.testset = train_dict, test_dict def download_files(data_cache_dir): URL = "https://fedml.s3-us-west-1.amazonaws.com/fed_emnist.tar.bz2" if not os.path.exists(data_cache_dir): os.makedirs(data_cache_dir) file_path = os.path.join(data_cache_dir,"fed_emnist.tar.bz2") # Download and decompress the file (if we haven't already) if not os.path.exists(file_path): wget.download(URL, out=file_path) file = tarfile.open(file_path) file.extractall(os.path.join(data_cache_dir,'femnist')) file.close() ================================================ FILE: experiments/cv_cnn_femnist/model.py ================================================ import torch from torch import nn from torch.nn import functional as F from core.model import BaseModel ''' The CNN_DropOut model is taken from FedML repository. For more information regarding this model, please refer to https://github.com/FedML-AI/FedML/blob/master/python/fedml/model/cv/cnn.py. ''' class CNN_DropOut(torch.nn.Module): """ Recommended model by "Adaptive Federated Optimization" (https://arxiv.org/pdf/2003.00295.pdf) Used for EMNIST experiments. 
When `only_digits=True`, the summary of returned model is ``` Model: _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= reshape (Reshape) (None, 28, 28, 1) 0 _________________________________________________________________ conv2d (Conv2D) (None, 26, 26, 32) 320 _________________________________________________________________ conv2d_1 (Conv2D) (None, 24, 24, 64) 18496 _________________________________________________________________ max_pooling2d (MaxPooling2D) (None, 12, 12, 64) 0 _________________________________________________________________ dropout (Dropout) (None, 12, 12, 64) 0 _________________________________________________________________ flatten (Flatten) (None, 9216) 0 _________________________________________________________________ dense (Dense) (None, 128) 1179776 _________________________________________________________________ dropout_1 (Dropout) (None, 128) 0 _________________________________________________________________ dense_1 (Dense) (None, 10) 1290 ================================================================= Total params: 1,199,882 Trainable params: 1,199,882 Non-trainable params: 0 ``` Args: only_digits: If True, uses a final layer with 10 outputs, for use with the digits only MNIST dataset (http://yann.lecun.com/exdb/mnist/). If False, uses 62 outputs for Federated Extended MNIST (FEMNIST) EMNIST: Extending MNIST to handwritten letters: https://arxiv.org/abs/1702.05373. Returns: A `torch.nn.Module`. 
""" def __init__(self, only_digits=True): super(CNN_DropOut, self).__init__() self.conv2d_1 = torch.nn.Conv2d(1, 32, kernel_size=3) self.max_pooling = nn.MaxPool2d(2, stride=2) self.conv2d_2 = torch.nn.Conv2d(32, 64, kernel_size=3) self.dropout_1 = nn.Dropout(0.25) self.flatten = nn.Flatten() self.linear_1 = nn.Linear(9216, 128) self.dropout_2 = nn.Dropout(0.5) self.linear_2 = nn.Linear(128, 10 if only_digits else 62) self.relu = nn.ReLU() # self.softmax = nn.Softmax(dim=1) def forward(self, x): x = torch.unsqueeze(x, 1) x = self.conv2d_1(x) x = self.relu(x) x = self.conv2d_2(x) x = self.relu(x) x = self.max_pooling(x) x = self.dropout_1(x) x = self.flatten(x) x = self.linear_1(x) x = self.relu(x) x = self.dropout_2(x) x = self.linear_2(x) # x = self.softmax(self.linear_2(x)) return x class CNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = CNN_DropOut(False) def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) criterion = nn.CrossEntropyLoss().to(device) return criterion(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() return {'output':output, 'acc': accuracy, 'batch_size': n_samples} ================================================ FILE: experiments/cv_lr_mnist/README.md ================================================ ## FedML Benchmark ### Examples The example in this folder was taken from 
[FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version). ### Data FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository. ### Run If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/cv_lr_mnist/config.yaml -task cv_lr_mnist -backend nccl ``` ### FedML Results This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). ``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. 
```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "mnist" data_cache_dir: ~/fedml_data partition_method: "hetero" partition_alpha: 0.5 model_args: model: "lr" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 1000 client_num_per_round: 10 comm_round: 100 epochs: 1 batch_size: 10 client_optimizer: sgd learning_rate: 0.03 weight_decay: 0.001 validation_args: frequency_of_the_test: 20 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedemnist_cnn/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ### Flower Results This comparison was carried out using Flower (Simulator) on version 1.0.0 at commit ID [4e7fad9](https://github.com/adap/flower/tree/4e7fad99389a5ee511730841b61f279e3359cb16). Showing that in some cases FLUTE can outperform 53x faster. ``` ________________________________________________ | | Flower (Ray) | FLUTE (NCCL/Gloo) | | | Acc | Time | Acc | Time | |--------|-----|-------------|-----|-------------| | CPU | ~80 | 00:30:14 | ~80 | 00:03:20 | | GPU 2x | ~80 | 01:21:44 | ~80 | 00:01:31 | | GPU 4x | ~79 | 00:56:45 | ~81 | 00:01:26 | ------------------------------------------------ ``` ### Flower Configuration file In order to reproduce this experiment in Flower please use the following patch [file](https://github.com/AnonymousQTHM31/FLUTE/blob/main/flower.patch) for the CPU setup. If you want to use multiple GPUs, follow the configuration suggested [here](https://github.com/adap/flower/issues/1415) ================================================ FILE: experiments/cv_lr_mnist/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. 
# Parameters needed to initialize the model model_config: model_type: LR # class w/ `loss` and `inference` methods model_folder: experiments/cv_lr_mnist/model.py # file containing class input_dim: 784 output_dim: 10 # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1000 # how many iterations between metric eval on val set rec_freq: 20 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 100 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 10 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.03 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 10 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.03 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv_lr_mnist/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv_lr_mnist.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/cv_lr_mnist/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.cv_lr_mnist.dataloaders.preprocessing import MNIST class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if user_idx == -1: self.user = self.user_list self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.hstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = MNIST() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/cv_lr_mnist/dataloaders/preprocessing.py ================================================ import os import wget import zipfile import numpy as np import json FEDML_DATA_MNIST_URL = "https://fedcv.s3.us-west-1.amazonaws.com/MNIST.zip" data_cache_dir = "./data" ''' The MNIST dataset is taken from FedML repository. 
For more information regarding this dataset, please refer to https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/MNIST. In order to download the data run the following commands: - wget --no-check-certificate --no-proxy https://fedcv.s3.us-west-1.amazonaws.com/MNIST.zip - unzip MNIST.zip ''' class MNIST: def __init__(self) : download_mnist(data_cache_dir) self.trainset, self.testset = read_data( train_data_dir = os.path.join(data_cache_dir,'MNIST','train'), test_data_dir= os.path.join(data_cache_dir,'MNIST','test'), ) print("Dictionaries ready ..") def download_mnist(data_cache_dir): if not os.path.exists(data_cache_dir): os.makedirs(data_cache_dir) file_path = os.path.join(data_cache_dir,"MNIST.zip") # Download the file (if we haven't already) if not os.path.exists(file_path): wget.download(FEDML_DATA_MNIST_URL, out=file_path) with zipfile.ZipFile(file_path, "r") as zip_ref: zip_ref.extractall(data_cache_dir) def read_data(train_data_dir, test_data_dir): train_files = os.listdir(train_data_dir) train_files = [f for f in train_files if f.endswith(".json")] for f in train_files: file_path = os.path.join(train_data_dir, f) with open(file_path, "r") as inf: train_data = json.load(inf) train_data['user_data_label'] = dict() for user in train_data['user_data']: train_data['user_data_label'][user] = train_data['user_data'][user]['y'] train_data['user_data'][user] = train_data['user_data'][user]['x'] test_files = os.listdir(test_data_dir) test_files = [f for f in test_files if f.endswith(".json")] for f in test_files: file_path = os.path.join(test_data_dir, f) with open(file_path, "r") as inf: test_data = json.load(inf) test_data['user_data_label'] = dict() for user in test_data['user_data']: test_data['user_data_label'][user] = test_data['user_data'][user]['y'] test_data['user_data'][user] = test_data['user_data'][user]['x'] return train_data, test_data ================================================ FILE: experiments/cv_lr_mnist/model.py 
================================================ import torch from torch import nn from torch.nn import functional as F from core.model import BaseModel ''' The LogisticRegression model is taken from FedML repository. For more information regarding this model, please refer to https://github.com/FedML-AI/FedML/blob/master/python/fedml/model/linear/lr.py. ''' class LogisticRegression(torch.nn.Module): def __init__(self, input_dim, output_dim): super(LogisticRegression, self).__init__() self.linear = torch.nn.Linear(input_dim, output_dim) def forward(self, x): o = self.linear(x.view(-1,28*28)) outputs = torch.sigmoid(o) #outputs = torch.sigmoid(self.linear(x)) return outputs class LR(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = LogisticRegression(model_config['input_dim'], model_config['output_dim']) def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) criterion = nn.CrossEntropyLoss().to(device) return criterion(output, labels.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' features, labels = input['x'].to(device), input['y'].to(device) output = self.net.forward(features) n_samples = features.shape[0] accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item() return {'output':output, 'acc': accuracy, 'batch_size': n_samples} ================================================ FILE: experiments/cv_resnet_fedcifar100/README.md ================================================ ## FedML Benchmark ### Examples The example in this folder was taken from [FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository 
on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version). ### Data FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository. ### Run If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/cv_resnet_fedcifar100/config.yaml -task cv_resnet_fedcifar100 -backend nccl ``` ### Results This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). ``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. 
```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "fed_cifar100" data_cache_dir: ~/fedml_data partition_method: "hetero" partition_alpha: 0.5 model_args: model: "resnet18_gn" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 500 client_num_per_round: 10 comm_round: 4000 epochs: 1 batch_size: 20 client_optimizer: sgd learning_rate: 0.1 weight_decay: 0.001 validation_args: frequency_of_the_test: 50 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedcifar100_resnet18/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ================================================ FILE: experiments/cv_resnet_fedcifar100/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. # Parameters needed to initialize the model model_config: model_type: RESNET # class w/ `loss` and `inference` methods model_folder: experiments/cv_resnet_fedcifar100/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50000 # how many iterations between metric eval on val set rec_freq: 50 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 4000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 20 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 20 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.1 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 20 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.1 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/cv_resnet_fedcifar100/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.cv_resnet_fedcifar100.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/cv_resnet_fedcifar100/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class Dataset(BaseDataset):
    """FedCIFAR100 dataset wrapper.

    In test mode (or when ``user_idx == -1``) the data of every user is
    concatenated into a single array; in train mode the dataset holds the
    data of the single user selected by ``user_idx``.
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx

        # Get all data
        self.user_list, self.user_data, self.user_data_label, self.num_samples = \
            self.load_data(data, self.test_only)

        # The original code had two byte-identical aggregation branches for
        # `user_idx == -1` and `test_only`; they are merged here.
        if user_idx == -1 or self.test_only:
            # combine all users' data into single arrays
            self.user = 'test_only'
            self.features = np.vstack(list(self.user_data.values()))
            self.labels = np.hstack(list(self.user_data_label.values()))
        else:
            # get a single user's data
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            self.user = self.user_list[user_idx]
            self.features = self.user_data[self.user]
            self.labels = self.user_data_label[self.user]

    def __getitem__(self, idx):
        # .T reverses the axis order so channels come first
        # (presumably HWC image input — TODO confirm against preprocessing)
        return np.array(self.features[idx]).astype(np.float32).T, self.labels[idx]

    def __len__(self):
        return len(self.features)

    def load_data(self, data, test_only):
        '''Wrapper method to read/instantiate the dataset.

        If ``data`` is None, the FEDCIFAR100 dataset is downloaded and the
        matching split (test or train) is used instead.
        '''
        if data is None:  # was `data == None`; identity check is the correct idiom
            dataset = FEDCIFAR100()
            data = dataset.testset if test_only else dataset.trainset

        users = data['users']
        features = data['user_data']
        labels = data['user_data_label']
        num_samples = data['num_samples']

        return users, features, labels, num_samples
'''
The FedCIFAR100 dataset is taken from FedML repository. For more information
regarding this dataset, please refer to
https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/fed_cifar100.

In order to download the data run the following commands:
 - wget --no-check-certificate --no-proxy https://fedml.s3-us-west-1.amazonaws.com/fed_cifar100.tar.bz2
 - tar -xvf fed_cifar100.tar.bz2
'''


def _split_to_dict(h5_file):
    '''Convert one HDF5 split into the {users, num_samples, user_data, user_data_label} layout.'''
    split = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()}
    for user in h5_file['examples'].keys():
        images = h5_file['examples'][user]['image'][()]
        labels = h5_file['examples'][user]['label'][()]
        split['users'].append(user)
        split['num_samples'].append(len(images))
        split['user_data'][user] = images
        split['user_data_label'][user] = labels
    return split


class FEDCIFAR100:
    '''Downloads (if needed) the FedCIFAR100 HDF5 files and exposes them as
    ``trainset``/``testset`` dictionaries.'''

    def __init__(self):
        download_files(data_cache_dir)

        # Preprocess datasets. Use context managers so the HDF5 handles are
        # closed once the arrays are copied into memory (the original code
        # leaked both file handles).
        with h5py.File(os.path.join(data_cache_dir, 'fed_cifar100', DEFAULT_TRAIN_FILE), "r") as train_h5:
            train_dict = _split_to_dict(train_h5)
        with h5py.File(os.path.join(data_cache_dir, 'fed_cifar100', DEFAULT_TEST_FILE), "r") as test_h5:
            test_dict = _split_to_dict(test_h5)

        print(" Dictionaries ready .. ")
        self.trainset, self.testset = train_dict, test_dict


def download_files(data_cache_dir):
    '''Download and extract the FedCIFAR100 archive into ``data_cache_dir``.

    No-op if the archive has already been downloaded.
    '''
    URL = "https://fedml.s3-us-west-1.amazonaws.com/fed_cifar100.tar.bz2"

    if not os.path.exists(data_cache_dir):
        os.makedirs(data_cache_dir)

    file_path = os.path.join(data_cache_dir, "fed_cifar100.tar.bz2")

    # Download and decompress the file (if we haven't already).
    # NOTE(review): tarfile.extractall trusts the archive's member paths;
    # acceptable here because the URL is a fixed first-party source.
    if not os.path.exists(file_path):
        wget.download(URL, out=file_path)
        with tarfile.open(file_path) as archive:
            archive.extractall(os.path.join(data_cache_dir, 'fed_cifar100'))
""" if not use_input_stats and (running_mean is None or running_var is None): raise ValueError( "Expected running_mean and running_var to be not None when use_input_stats=False" ) b, c = input.size(0), input.size(1) if weight is not None: weight = weight.repeat(b) if bias is not None: bias = bias.repeat(b) def _instance_norm( input, group, running_mean=None, running_var=None, weight=None, bias=None, use_input_stats=None, momentum=None, eps=None, ): # Repeat stored stats and affine transform params if necessary if running_mean is not None: running_mean_orig = running_mean running_mean = running_mean_orig.repeat(b) if running_var is not None: running_var_orig = running_var running_var = running_var_orig.repeat(b) # norm_shape = [1, b * c / group, group] # print(norm_shape) # Apply instance norm input_reshaped = input.contiguous().view( 1, int(b * c / group), group, *input.size()[2:] ) out = F.batch_norm( input_reshaped, running_mean, running_var, weight=weight, bias=bias, training=use_input_stats, momentum=momentum, eps=eps, ) # Reshape back if running_mean is not None: running_mean_orig.copy_( running_mean.view(b, int(c / group)).mean(0, keepdim=False) ) if running_var is not None: running_var_orig.copy_( running_var.view(b, int(c / group)).mean(0, keepdim=False) ) return out.view(b, c, *input.size()[2:]) return _instance_norm( input, group, running_mean=running_mean, running_var=running_var, weight=weight, bias=bias, use_input_stats=use_input_stats, momentum=momentum, eps=eps, ) class _GroupNorm(_BatchNorm): def __init__( self, num_features, num_groups=1, eps=1e-5, momentum=0.1, affine=False, track_running_stats=False, ): self.num_groups = num_groups self.track_running_stats = track_running_stats super(_GroupNorm, self).__init__( int(num_features / num_groups), eps, momentum, affine, track_running_stats ) def _check_input_dim(self, input): return NotImplemented def forward(self, input): self._check_input_dim(input) return group_norm( input, self.num_groups, 
self.running_mean, self.running_var, self.weight, self.bias, self.training or not self.track_running_stats, self.momentum, self.eps, ) class GroupNorm2d(_GroupNorm): r"""Applies Group Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper https://arxiv.org/pdf/1803.08494.pdf `Group Normalization`_ . Args: num_features: :math:`C` from an expected input of size :math:`(N, C, H, W)` num_groups: eps: a value added to the denominator for numerical stability. Default: 1e-5 momentum: the value used for the running_mean and running_var computation. Default: 0.1 affine: a boolean value that when set to ``True``, this module has learnable affine parameters. Default: ``True`` track_running_stats: a boolean value that when set to ``True``, this module tracks the running mean and variance, and when set to ``False``, this module does not track such statistics and always uses batch statistics in both training and eval modes. Default: ``False`` Shape: - Input: :math:`(N, C, H, W)` - Output: :math:`(N, C, H, W)` (same shape as input) Examples: >>> # Without Learnable Parameters >>> m = GroupNorm2d(100, 4) >>> # With Learnable Parameters >>> m = GroupNorm2d(100, 4, affine=True) >>> input = torch.randn(20, 100, 35, 45) >>> output = m(input) """ def _check_input_dim(self, input): if input.dim() != 4: raise ValueError("expected 4D input (got {}D input)".format(input.dim())) class GroupNorm3d(_GroupNorm): """ Assume the data format is (B, C, D, H, W) """ def _check_input_dim(self, input): if input.dim() != 5: raise ValueError("expected 5D input (got {}D input)".format(input.dim())) ================================================ FILE: experiments/cv_resnet_fedcifar100/model.py ================================================ import math import torch import torch.nn as nn import torch.utils.model_zoo as model_zoo from torch.nn import functional as F from experiments.cv_resnet_fedcifar100.group_normalization import 
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(
        in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
    )


def norm2d(planes, num_channels_per_group=32):
    """Build the block's normalization layer.

    Returns GroupNorm2d when a positive group size is given, otherwise a
    plain BatchNorm2d.
    """
    print("num_channels_per_group:{}".format(num_channels_per_group))
    if num_channels_per_group <= 0:
        return nn.BatchNorm2d(planes)
    return GroupNorm2d(
        planes, num_channels_per_group, affine=True, track_running_stats=False
    )


class BasicBlock(nn.Module):
    """ResNet basic block: two 3x3 convs with a residual connection."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, group_norm=0):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm2d(planes, group_norm)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm2d(planes, group_norm)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Project the identity path when the shapes differ
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class Bottleneck(nn.Module):
    # ResNet bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand (4x), with
    # a residual connection.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, group_norm=0):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm2d(planes, group_norm)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = norm2d(planes, group_norm)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = norm2d(planes * 4, group_norm)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample  # optional projection for the identity path
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # Project the identity when shapes differ (stride or channel change)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    # ResNet backbone parameterized by block type and per-stage depths.
    # `group_norm` selects GroupNorm channels-per-group (0 = BatchNorm).
    def __init__(self, block, layers, num_classes=1000, group_norm=0):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm2d(64, group_norm)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], group_norm=group_norm)
        self.layer2 = self._make_layer(
            block, 128, layers[1], stride=2, group_norm=group_norm
        )
        self.layer3 = self._make_layer(
            block, 256, layers[2], stride=2, group_norm=group_norm
        )
        self.layer4 = self._make_layer(
            block, 512, layers[3], stride=2, group_norm=group_norm
        )
        # self.avgpool = nn.AvgPool2d(7, stride=1)
        # 1x1 average pool — presumably because CIFAR feature maps are
        # already small at this stage (the 7x7 ImageNet variant is kept
        # commented out above); TODO confirm intended input resolution.
        self.avgpool = nn.AvgPool2d(1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # He-style init for convs; unit weight / zero bias for norm layers
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, GroupNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        # Zero-init the last norm layer of every residual block so each
        # block initially passes the identity through.
        for m in self.modules():
            if isinstance(m, Bottleneck):
                m.bn3.weight.data.fill_(0)
            if isinstance(m, BasicBlock):
                m.bn2.weight.data.fill_(0)

    def _make_layer(self, block, planes, blocks, stride=1, group_norm=0):
        # Build one stage: the first block may downsample; the rest keep shape.
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection to match the residual path's shape
            downsample = nn.Sequential(
                nn.Conv2d(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                norm2d(planes * block.expansion, group_norm),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, group_norm))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, group_norm=group_norm))

        return nn.Sequential(*layers)

    def forward(self, x):
        # Stem
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # Four residual stages
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Pool, flatten, classify
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet18"]))
    return model


def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet34"]))
    return model


def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet50"]))
    return model


def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet101"]))
    return model
def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls["resnet152"]))
    return model


class RESNET(BaseModel):
    '''This is a PyTorch model with some extra methods'''

    def __init__(self, model_config):
        super().__init__()
        # model_config is accepted for interface compatibility but unused here
        self.net = resnet18()

    def _forward_on_device(self, input):
        '''Move the batch to the available device and run the network.

        Args:
            input (dict): batch with tensors under keys 'x' and 'y'.

        Returns:
            tuple: (output logits, labels on device, batch size).
        '''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # NOTE(review): only the batch is moved to `device`; self.net is
        # presumably placed on the device by the framework — confirm.
        features, labels = input['x'].to(device), input['y'].to(device)
        # Call the module itself rather than .forward() so registered hooks run
        output = self.net(features)
        return output, labels, features.shape[0]

    def loss(self, input: dict) -> torch.Tensor:
        '''Performs forward step and computes the loss'''
        output, labels, _ = self._forward_on_device(input)
        return F.cross_entropy(output, labels.long())

    def inference(self, input):
        '''Performs forward step and computes metrics

        Returns:
            dict: raw 'output' logits, 'acc' top-1 accuracy, 'batch_size'.
        '''
        output, labels, n_samples = self._forward_on_device(input)
        accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item()
        return {'output': output, 'acc': accuracy, 'batch_size': n_samples}
"execution_count": 23, "metadata": { "gather": { "logged": 1644397182872 } }, "outputs": [], "source": [ "import csv\n", "import time\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "import torch\n", "import torch.nn as nn\n", "import matplotlib.pyplot as plt\n", "import torch.nn.functional as F\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.optim import AdamW, SGD\n", "from torch.optim.lr_scheduler import StepLR\n", "\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "gather": { "logged": 1644332993422 } }, "outputs": [], "source": [ "class Config:\n", " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", " train_csv_path = './raw_data/mitbih_train.csv'\n", " test_csv_path = './raw_data/mitbih_test.csv'\n", " seed = 123\n", "config = Config" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, "source_hidden": false }, "nteract": { "transient": { "deleting": false } } }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": { "gather": { "logged": 1644332993546 } }, "outputs": [], "source": [ "class ECGDataset(Dataset):\n", "\n", " def __init__(self, df):\n", " self.df = df\n", " self.data_columns = self.df.columns[:-2].tolist()\n", "\n", " def __getitem__(self, idx):\n", " signal = self.df.loc[idx, self.data_columns].astype('float32')\n", " signal = torch.FloatTensor([signal.values]) \n", " target = torch.LongTensor(np.array(self.df.loc[idx, 'class']))\n", " return signal, target\n", "\n", " def __len__(self):\n", " return len(self.df)\n", "\n", "id_to_label = {\n", " 0: \"Normal\",\n", " 1: \"Artial Premature\",\n", " 2: \"Premature ventricular contraction\",\n", " 3: \"Fusion of 
ventricular and normal\",\n", " 4: \"Fusion of paced and normal\"\n", "}\n", "\n", "def get_dataloader(phase: str, batch_size: int = 96) -> DataLoader:\n", " '''\n", " Dataset and DataLoader.\n", " Parameters:\n", " pahse: training or validation phase.\n", " batch_size: data per iteration.\n", " Returns:\n", " data generator\n", " '''\n", " df = pd.read_csv(config.train_csv_path, header=None)\n", " df.rename(columns={187: 'class'}, inplace=True)\n", " df['label'] = df.iloc[:, -1].map(id_to_label)\n", " train_df, val_df = train_test_split(\n", " df, test_size=0.15, random_state=config.seed, stratify=df['label']\n", " )\n", " train_df, val_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True)\n", " df = train_df if phase == 'train' else val_df\n", " dataset = ECGDataset(df)\n", " dataloader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=4)\n", " return dataloader" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "gather": { "logged": 1644332993675 } }, "outputs": [], "source": [ "class Swish(nn.Module):\n", " def forward(self, x):\n", " return x * torch.sigmoid(x)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "gather": { "logged": 1644332993801 } }, "outputs": [], "source": [ "class ConvNormPool(nn.Module):\n", " \"\"\"Conv Skip-connection module\"\"\"\n", " def __init__(\n", " self,\n", " input_size,\n", " hidden_size,\n", " kernel_size,\n", " norm_type='bachnorm'\n", " ):\n", " super().__init__()\n", " \n", " self.kernel_size = kernel_size\n", " self.conv_1 = nn.Conv1d(\n", " in_channels=input_size,\n", " out_channels=hidden_size,\n", " kernel_size=kernel_size\n", " )\n", " self.conv_2 = nn.Conv1d(\n", " in_channels=hidden_size,\n", " out_channels=hidden_size,\n", " kernel_size=kernel_size\n", " )\n", " self.conv_3 = nn.Conv1d(\n", " in_channels=hidden_size,\n", " out_channels=hidden_size,\n", " kernel_size=kernel_size\n", " )\n", " self.swish_1 = Swish()\n", " self.swish_2 = Swish()\n", " 
self.swish_3 = Swish()\n", " if norm_type == 'group':\n", " self.normalization_1 = nn.GroupNorm(\n", " num_groups=8,\n", " num_channels=hidden_size\n", " )\n", " self.normalization_2 = nn.GroupNorm(\n", " num_groups=8,\n", " num_channels=hidden_size\n", " )\n", " self.normalization_3 = nn.GroupNorm(\n", " num_groups=8,\n", " num_channels=hidden_size\n", " )\n", " else:\n", " self.normalization_1 = nn.BatchNorm1d(num_features=hidden_size)\n", " self.normalization_2 = nn.BatchNorm1d(num_features=hidden_size)\n", " self.normalization_3 = nn.BatchNorm1d(num_features=hidden_size)\n", " \n", " self.pool = nn.MaxPool1d(kernel_size=2)\n", " \n", " def forward(self, input):\n", " conv1 = self.conv_1(input)\n", " x = self.normalization_1(conv1)\n", " x = self.swish_1(x)\n", " x = F.pad(x, pad=(self.kernel_size - 1, 0))\n", " \n", " x = self.conv_2(x)\n", " x = self.normalization_2(x)\n", " x = self.swish_2(x)\n", " x = F.pad(x, pad=(self.kernel_size - 1, 0))\n", " \n", " conv3 = self.conv_3(x)\n", " x = self.normalization_3(conv1+conv3)\n", " x = self.swish_3(x)\n", " x = F.pad(x, pad=(self.kernel_size - 1, 0)) \n", " \n", " x = self.pool(x)\n", " return x" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "gather": { "logged": 1644332993953 } }, "outputs": [], "source": [ "class RNN(nn.Module):\n", " \"\"\"RNN module(cell type lstm or gru)\"\"\"\n", " def __init__(\n", " self,\n", " input_size,\n", " hid_size,\n", " num_rnn_layers=1,\n", " dropout_p = 0.2,\n", " bidirectional = False,\n", " rnn_type = 'lstm',\n", " ):\n", " super().__init__()\n", " \n", " if rnn_type == 'lstm':\n", " self.rnn_layer = nn.LSTM(\n", " input_size=input_size,\n", " hidden_size=hid_size,\n", " num_layers=num_rnn_layers,\n", " dropout=dropout_p if num_rnn_layers>1 else 0,\n", " bidirectional=bidirectional,\n", " batch_first=True,\n", " )\n", " \n", " else:\n", " self.rnn_layer = nn.GRU(\n", " input_size=input_size,\n", " hidden_size=hid_size,\n", " num_layers=num_rnn_layers,\n", " 
dropout=dropout_p if num_rnn_layers>1 else 0,\n", " bidirectional=bidirectional,\n", " batch_first=True,\n", " )\n", " def forward(self, input):\n", " outputs, hidden_states = self.rnn_layer(input)\n", " return outputs, hidden_states" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "gather": { "logged": 1644332994075 } }, "outputs": [], "source": [ "class RNNAttentionModel(nn.Module):\n", " def __init__(\n", " self,\n", " input_size,\n", " hid_size,\n", " rnn_type,\n", " bidirectional,\n", " n_classes=5,\n", " kernel_size=5,\n", " ):\n", " super().__init__()\n", " \n", " self.rnn_layer = RNN(\n", " input_size=46,\n", " hid_size=hid_size,\n", " rnn_type=rnn_type,\n", " bidirectional=bidirectional\n", " )\n", " self.conv1 = ConvNormPool(\n", " input_size=input_size,\n", " hidden_size=hid_size,\n", " kernel_size=kernel_size,\n", " )\n", " self.conv2 = ConvNormPool(\n", " input_size=hid_size,\n", " hidden_size=hid_size,\n", " kernel_size=kernel_size,\n", " )\n", " self.avgpool = nn.AdaptiveMaxPool1d((1))\n", " self.attn = nn.Linear(hid_size, hid_size, bias=False)\n", " self.fc = nn.Linear(in_features=hid_size, out_features=n_classes)\n", " \n", " def forward(self, input):\n", " x = self.conv1(input)\n", " x = self.conv2(x)\n", " x_out, hid_states = self.rnn_layer(x)\n", " x = torch.cat([hid_states[0], hid_states[1]], dim=0).transpose(0, 1)\n", " x_attn = torch.tanh(self.attn(x))\n", " x = x_attn.bmm(x_out)\n", " x = x.transpose(2, 1)\n", " x = self.avgpool(x)\n", " x = x.view(-1, x.size(1) * x.size(2))\n", " x = F.softmax(self.fc(x), dim=-1)\n", " return x" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "gather": { "logged": 1644332994213 } }, "outputs": [], "source": [ "class Meter:\n", " def __init__(self, n_classes=5):\n", " self.metrics = {}\n", " self.confusion = torch.zeros((n_classes, n_classes))\n", " \n", " def update(self, x, y, loss):\n", " x = np.argmax(x.detach().cpu().numpy(), axis=1)\n", " y = y.detach().cpu().numpy()\n", 
" # print('here!', recall_score(x,y, average='macro', zero_division=1))\n", " self.metrics['loss'] += loss\n", " self.metrics['accuracy'] += accuracy_score(x,y)\n", " self.metrics['f1'] += f1_score(x,y,average='macro')\n", " self.metrics['precision'] += precision_score(x, y, average='macro', zero_division=1)\n", " self.metrics['recall'] += recall_score(x,y, average='macro', zero_division=1)\n", " \n", " self._compute_cm(x, y)\n", " \n", " def _compute_cm(self, x, y):\n", " for prob, target in zip(x, y):\n", " if prob == target:\n", " self.confusion[target][target] += 1\n", " else:\n", " self.confusion[target][prob] += 1\n", " \n", " def init_metrics(self):\n", " self.metrics['loss'] = 0\n", " self.metrics['accuracy'] = 0\n", " self.metrics['f1'] = 0\n", " self.metrics['precision'] = 0\n", " self.metrics['recall'] = 0\n", " \n", " def get_metrics(self):\n", " return self.metrics\n", " \n", " def get_confusion_matrix(self):\n", " return self.confusion" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "gather": { "logged": 1644397187037 } }, "outputs": [], "source": [ "class Trainer:\n", " def __init__(self, net, lr, batch_size, num_epochs):\n", " self.net = net.to(config.device)\n", " self.num_epochs = num_epochs\n", " self.criterion = nn.CrossEntropyLoss(weight=torch.tensor([1,3,3,4,12]).float().to(config.device))\n", " # self.optimizer = AdamW(self.net.parameters(), lr=lr)\n", " self.optimizer = SGD(self.net.parameters(), lr=lr)\n", " # self.scheduler = CosineAnnealingLR(self.optimizer, T_max=num_epochs, eta_min=5e-6)\n", " self.scheduler = StepLR(self.optimizer, step_size=100, gamma=1.0)\n", " self.best_loss = float('inf')\n", " self.phases = ['train', 'val']\n", " self.dataloaders = {\n", " phase: get_dataloader(phase, batch_size) for phase in self.phases\n", " }\n", " self.train_df_logs = pd.DataFrame()\n", " self.val_df_logs = pd.DataFrame()\n", " \n", " def _train_epoch(self, phase):\n", " print(f\"{phase} mode | time: 
{time.strftime('%H:%M:%S')}\")\n", " \n", " self.net.train() if phase == 'train' else self.net.eval()\n", " meter = Meter()\n", " meter.init_metrics()\n", " \n", " for i, (data, target) in enumerate(self.dataloaders[phase]):\n", " data = data.to(config.device)\n", " target = target.to(config.device)\n", " \n", " output = self.net(data).to(config.device)\n", " loss = self.criterion(output.to(config.device), target.to(config.device))\n", " \n", " if phase == 'train':\n", " self.optimizer.zero_grad()\n", " loss.backward()\n", " self.optimizer.step()\n", " \n", " meter.update(output, target, loss.item())\n", " \n", " metrics = meter.get_metrics()\n", " metrics = {k:v / i for k, v in metrics.items()}\n", " df_logs = pd.DataFrame([metrics])\n", " confusion_matrix = meter.get_confusion_matrix()\n", " \n", " if phase == 'train':\n", " self.train_df_logs = pd.concat([self.train_df_logs, df_logs], axis=0)\n", " else:\n", " self.val_df_logs = pd.concat([self.val_df_logs, df_logs], axis=0)\n", " \n", " # show logs\n", " print('{}: {}, {}: {}, {}: {}, {}: {}, {}: {}'\n", " .format(*(x for kv in metrics.items() for x in kv))\n", " )\n", " fig, ax = plt.subplots(figsize=(5, 5))\n", " cm_ = ax.imshow(confusion_matrix, cmap='hot')\n", " ax.set_title('Confusion matrix', fontsize=15)\n", " ax.set_xlabel('Actual', fontsize=13)\n", " ax.set_ylabel('Predicted', fontsize=13)\n", " plt.colorbar(cm_)\n", " plt.show()\n", " \n", " return loss\n", " \n", " def run(self):\n", " for epoch in range(self.num_epochs):\n", " self._train_epoch(phase='train')\n", " with torch.no_grad():\n", " val_loss = self._train_epoch(phase='val')\n", " self.scheduler.step()\n", " \n", " if val_loss < self.best_loss:\n", " self.best_loss = val_loss\n", " print('\\nNew checkpoint\\n')\n", " self.best_loss = val_loss\n", " torch.save(self.net.state_dict(), f\"best_model_epoc{epoch}.pth\")\n", " #clear_output()\n", " " ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "gather": { "logged": 
1644397187238 } }, "outputs": [], "source": [ "attn_model = RNNAttentionModel(1, 64, 'lstm', False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. 
Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "train mode | time: 09:00:00\n", "loss: 1.5601879076803884, accuracy: 0.07522580645161298, f1: 0.06748332740742076, precision: 0.3165194883003588, recall: 0.2457613616641853\n", "val mode | time: 09:00:31\n" ] }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAU4AAAEnCAYAAADGqKr7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7hdVX3u8e9riEAFROSeoFABK1DFknJQ2nMolJJWKuiRGvsoaYvGUmixh7aCx1at91bQYpVHFAqoJVC8gFTAyEUPFgIJIhACGiXKhkAM10AlGHjPH3MsXdnZt7mYa629st7P88xnzzXWvPz23tm//MYYc84l20RExNQ9p98BREQMmiTOiIiakjgjImpK4oyIqCmJMyKips36HUBEbBrmzp3rNWvW1Npn6dKlV9qe26WQuiaJMyIasWbNGpYsWVJrH0nbdymcrkrijIiGGFjf7yB6IokzIhqUxBkRUUMqzoiImpI4IyJqSuKMiKhpeBJnLoDvMkmvl3S1pEckrZP0fUkf6NZlGJIOlnSzpCclNfboK0nvlVTvIr1pTNICSUfX2P5cSfWutRk6rcRZZxlMqTi7SNJpwDuAfwM+DjwG7AP8ObAv8LounPYzwGrgCGBdg8f9HPC1Bo/XbwuA24GvTnH79wNbdi+cTcXgJsM6UnF2iaQ/BP4P8Dbbb7X9Ndvfsn0m8BvAWV069a8BXy3nuqGpg9oesb20qeMNCklbAtj+oe3b+x3P9Gbg6ZrLxCRtIelGSd+TtEzS+0r7dpIWSfpB+fqCtn1OlbRC0l2SjmhrP0DSbeW9MySptG8u6cLSvljS7pPFlcTZPX8N3Gz7nNFv2H7a9uWt15K2l3SepAcl/bekayXNad9H0kpJH5P015JGJD0saaGkbcv7h5Su+QzgXyRZ0rnlPUs6cdTxNuh6S9pW0uck3Ve6+T+R9Nnxti9te0j6qqTHJK2V9DVJe47axpJOkvQhST+VtFrSpyRtPtEPr9U1lvQaSXeUn8t/lj+YPSVdI+mJss3LR+17sqSbJD0q6YHRcUm6FjgAmF/is6Q/afs5nybp7yWNUPUSNuqqS7pM0p2txNp23icl7TvR97bp6kpXfR1wqO1XAPsDcyUdBJwCXGV7L+Cq8hpJ+wDzqHp0c4FPS5pRjnUmVU9jr7K0bvU8DnjY9p5UPcOPThZUEmcXSJoJvBq4Yoq7fJWqa/03wBupfi/XjE5CwB8Bh1H98t8JHAl8qLx3M/Cqsn5aWX9/jbBPB36LKuEfAbyL6i9hTCXxXQW8DHgb8CfAHsC3JG03avOTgV2BNwP/DLwdOGkKMb0I+Efg3VTf86upKvWFZXkD1XDTwlb1UMwG/hU4qsQ2A/iOpOeX9/8CuBP4OtXP6VXAf7bt/8fA/yrbvXGc2N4G7AB8GEDSy4APAO+xvWwK39smqPnE6crj5eXMspjqd3teaT8PaI1XHwUstL3O9t3ACuBASbsA29i+3tXHXpw/ap/WsS4GDhv172kjGePsjhcCmwM/mWxDSXOBg4FDbH+rtF0NrAT+lirJtPwcONr2+rJd63/Xv7D9GHBD+X2v7KCbfiDwKdsXtrV9YYLt/5Qqse1t+0clnsXAj0rMH27bdqXtPynrV0o6GHg98E+TxLQd8CrbPyzHfznVz2S+7fNLm6iS3q8BywFs/3XrAKXaWEQ17nsUcL7tOyQ9Afx0gp/TkbafHC8w26tKFf9FSV8r3+93gY9N8j3FhrbXhpNuZ9neYBir/A6XAntS/RtdLGkn26vgF7+LHcvms4D23+lIaft5WR/d3trnnnKs9ZIepfobHncyNImzu6Yyq30g1R/wt36xk/2EpMuoKsB217SSZnEHsKOk59p+6lnGegvwt5KeBr5p+/tTiPvmVtIscY9I+s4YcX9j1Os7gDlMbmUraRYryterx2ibRUmcpSv3fqqx5Pbqd+8pnBOqLuC4SbPF9gWSXk+VuJ8BXmF
78oG7TVrtyaE1tif8t1B+pvuXYamvSNpvgs3HqhQ9QftE+4wrXfXueJBqbOZFU9h2F+CBMdofYMM/eoBHRr1+iuqX/ty6AY7hRKohg38A7lI16D5vgu2fbdxbTCGmsfYb3d5q2wJA0ouoErWoKt+Dgd+kqjinck4Y+/sazwVUvYtFtn9QY79NUHcvR7L9CHAt1djkA6X7Tfm6umw2AuzWttts4L7SPnuM9g32kbQZ8HzgoYliSeLsAts/B75DNVY4mVXAjmO078Qkv7wa1rFxct0gudl+xPZf2d4ZeAWwmKobus84x+xF3J2YC/wKcJTti23/F1U1PTqZT2RK179K2oZqMuG7wGvVNoM7nJpPnJJ2aJsA3RL4Xarx6UuB+WWz+cAlZf1SYJ6qmfI9qCaBbizd+rWSDirDO8eO2qd1rDcAV3uSj/9N4uyeTwBzJM0f/Yak55SxTagS1I6S/mfb+78CvAa4rqFYRqgmcX5xfuDQ8Ta2fSvVWOJzqMYOx7IYOKD842wddxbVBE5TcXdiS6puc/tf5R+x8bDUVKveiXyCauLpUODfgc+1TUANoa5UnLtQTZTeCtxEVdlfBnwEOFzSD4DDy2vKxNxFVMNBVwAntA2fHE91PfIK4IdA68qWs4EXSlpBdQnhKZMFlTHOLrH9NUmnA2eXyZBLgMepEtGfU03+XGH7yjIueKGkU6i6+X9DlQD+uaFwvgKcIOm7VJM3bwW2ad9A0nVlu9up/gLeBjwB3DjOMc+lmtm/XNI/UF2U916qAfXPNBR3J66mSmb/JulsqstS/oaNu/13AkeUKvFB4G7bD071JJKOpJog+33bj0j6S6qf3b9QXWEwhJq/5bL8J/7KMdofpLrCZKx9Pgh8cIz2JcBG46NlPPuYOnGl4uwi2ydTXc6yF1VFsojq0pyrqP73a3ldee8TwH9Qjc8dansFzXhfOe4HqBLeLcDo60uvp/qDv5jqf+ztqZLCCGOwvY5fdpvOprqc48dUVwf0ratu+zaqhPY/gMuoLi06Bnh01KYfoJpMuoiqkvnDqZ6jXG51FvBZ21eU8z5E9Z/NfFU3Pwyp4bjlUpN05SMipmTOnL29ZMkna+0jzV062az6dJSuekQ0ZHiejpTEGRENSeKMiKgpiTMiogNJnH01Q/LMfgdR034HHNDvEGq5c+ngPSVuEC8D2fuAwYp65cpnWLPGEz7kYmypOPtuJhveNzUIliwZrAeEv3riB8BMS8/rdwAdWLTk2V5n31tz5kx6m/44kjgjImpqPch405fEGRENScUZEdGB4UicgzVqHRExDaTijIiGpKseEVFTEmdERE1JnBERNSVxRkR0IIkzIqKGVJwRETUlcUZE1JTEGRFRUxJnREQHkjgjImpIxRkRUVMSZ0RETcOTOHv2dCRJcyXdJWmFpFN6dd6I6KWnay6DqScVp6QZwKeAw4ER4CZJl9q+oxfnj4heGJ6Ks1dd9QOBFbZ/BCBpIXAUkMQZsckYnsTZq676LOCettcjpW0DkhZIWiJpyeAW8RGxqetVxTnWxyl6owb7LOAsgC2kjd6PiOlseCrOXiXOETb8tN/ZwH09OndE9EQSZ9NuAvaStAdwLzAP+OMenTsieiKJs1G210s6EbgSmAGcY3tZL84dEb2UxNko218Hvt6r80VErw1PxZmPB46IhrQSZ51lYpJ2k3SNpOWSlkk6qbS/V9K9km4pyx+07XNqudHmLklHtLUfIOm28t4ZklTaN5d0YWlfLGn3yeLKLZcR0ZCuVJzrgZNt3yxpa2CppEXlvY/b/lj7xpL2oZpD2RfYFfimpL1tPw2cCSwAbqDq/c4FLgeOAx62vaekecBHgTdOFFQqzohoSPMVp+1Vtm8u62uB5YxxDXibo4CFttfZvhtYARwoaRdgG9vX2zZwPnB02z7nlfWLgcNa1eh4kjgjokHNJs52pQv9SmBxaTpR0q2SzpH0gtI23s02s8r66PYN9rG9HngUeOFEsSR
xRkRDOqo4t2/dLViWBWMdWdJWwJeAd9h+jKrb/RJgf2AVcFpr03ECm+gmnCndoNMuY5wR0ZCOxjjX2J4z0QaSZlIlzS/a/jKA7Qfa3v8scFl5Od7NNiNlfXR7+z4jkjYDng88NFFMqTgjoiFdmVUXcDaw3Pbpbe27tG32OuD2sn4pMK/MlO8B7AXcaHsVsFbSQeWYxwKXtO0zv6y/Abi6jIOOKxVnRDSkK7PqBwNvAW6TdEtpexfwJkn7l5OuBN4OYHuZpIuonry2HjihzKgDHA+cC2xJNZt+eWk/G/i8pBVUlea8yYJK4oyIBjX7XDPb1zH2GOS4N9PY/iDwwTHalwD7jdH+JHBMnbiSOCOiIcNz51ASZ0Q0ZHgSZyaHIiJqSsUZEQ0ZnooziTMiGpTEGRFRQyrOiIiakjj77ufAA5NuNb3MmviBKtPOI/0OYEjM0n/3O4RaftrxnkmcERH1eTg+2DuJMyKa80y/A+iNJM6IaIZp+o7LaSuJMyKakcQZEdGBdNUjImpIxRkR0YFUnBERNaTijIjoQBJnREQNJl31iIjaUnFGRNQwRGOceQJ8RERNqTgjojkZ44yIqGGIuupJnBHRnFScERE1pOKMiKhpiBJnT2bVJZ0jabWk23txvojok2dqLgOqV5cjnQvM7dG5IqIfWhVnnWVA9aSrbvvbknbvxbkioo8GOBnWkTHOiGhG7lXvD0kLgAUAg/VBuxEBpOLsB9tnAWcBzJDc53Aioo5UnBERHRiSirNXlyNdAFwPvFTSiKTjenHeiOihzKo3y/abenGeiOizIemq57FyEdGMLlScknaTdI2k5ZKWSTqptG8naZGkH5SvL2jb51RJKyTdJemItvYDJN1W3jtDkkr75pIuLO2Lp3LpZBJnRDSn+a76euBk2y8DDgJOkLQPcApwle29gKvKa8p784B9qW66+bSkGeVYZ1JdtbNXWVo35RwHPGx7T+DjwEcnCyqJMyKmLdurbN9c1tcCy4FZwFHAeWWz84Cjy/pRwELb62zfDawADpS0C7CN7ettGzh/1D6tY10MHNaqRseTxBkRzWhdjlTvXvXtJS1pWxaMd/jShX4lsBjYyfYqqJIrsGPZbBZwT9tuI6VtVlkf3b7BPrbXA48CL5zoW83lSBHRnPoz5Wtsz5lsI0lbAV8C3mH7sQkKwrHe8ATtE+0zrlScEdGMzirOSUmaSZU0v2j7y6X5gdL9pnxdXdpHgN3adp8N3FfaZ4/RvsE+kjYDng88NFFMSZwR0ZzmZ9UFnA0st31621uXAvPL+nzgkrb2eWWmfA+qSaAbS3d+raSDyjGPHbVP61hvAK4u46DjSlc9IprRnQcZHwy8BbhN0i2l7V3AR4CLys00PwGOAbC9TNJFwB1UM/In2G5FdTzVIy63BC4vC1SJ+fOSVlBVmvMmC0qTJNa+mSH5ef0Ooqat+x1ATY/0O4AhsW2/A6jpp8BTdu3n7Mx5ibzkQ/X20TyWTmWMc7pJxRkRzRiij85I4oyIZiRxRkR0YEjuVU/ijIhmpOKsSPqHqRzE9j82E05EDLRUnAD8dtu6gP8J3A/8GHgxsDPwre6EFhEDJRVnxfbhrXVJpwNXAx9uXRwq6VRg+65GGBGDI4lzI8cCO4+6ov6fqSrQkxuNKiIGTz5zaEw/A/YDbmlr+3XgyUYjajNo/3lt1+8AauraLy428OJ+B1DTo89m50H7o+1QncT5aeAKSZ8BVgK7Uz0U9JPNhxURAycV58Zsf1jSCNV9o8cA9wLvtH1+t4KLiJiOal3HafvzwOe7FEtEDLp01Tcm6Vepnhyyq+0TJe0NzLS9rCvRRcTgGKLLkab8PE5JhwPfo/rApGNL8w7Ax7oQV0QMoi48yHg6qlNxfgQ4xvYVkh4ubTcDv9F8WBExcIao4qyTOF9i+4qybgDbPyuPtY+IYTdEibPOR2fcI2m/9gZJr6C6NCkiYmi66nUS5xnAlyW9GZgh6X8DX6D6APeIGHa
tirPBzxyarupcx/nZ8iFH7wRmAO8DPlEuUYqIGOgqso6613GeBZzVpVgiYpBljHNjkpaP035bc+FExEBLV30js2u2R8Qwyb3qvyTpXa1t29Zb9gTuaTyqiBhMA1xF1jGVirP1MOOZbetQ/d9yP/BnTQcVEQNoiMY4J02ctn8HQNInbf9l90OKiIE1JF31WtdxStq5vUHSTpL2bDimiBhEQ3QdZ53E+e9s/PlCO5T2iIjcOTSGvW3fPqptGbD3ZDtK2k3SNZKWS1om6aRaUUZETCN1Lkd6RNL2tte0tW0PPDGFfdcDJ9u+WdLWwFJJi2zfUSfYiJjGhmhyqE7FuQg4U9JWAOXrJ4FvTLaj7VW2by7ra4HlwKz64UbEtDYkY5x1Ks5TgEuBByWtBnYElgKvrXNCSbsDrwQWj/HeAqoPgEN1DhoR/ZcL4Ddme42kg4HfpPrE05XAklGfsz6hUqV+CXiH7cfGOMcv7oWfIU35uBExTQxwFVlH3Yd8GLixLLWUBx5/Cfii7S/X3T8iprkhGuOcMHFKOsP2X5X1cZ+KZHvBJMcRcDaw3PbpnQQaEQNgSLrqk00OzRy1Pt4ymYOpPo/9UEm3lOUPOog3IqarLlwAL+kcSasl3d7W9l5J946VSySdKmmFpLskHdHWfoCk28p7Z5RiDkmbS7qwtC8uczCTmrDitH182/qfTuWA4xznOjLfE7Fp687k0LnAvwLnj2r/uO0NPmFX0j5UH1++L7Ar8E1Je9t+GjiTauL5BuDrwFzgcuA44GHbe0qaB3wUeONkQdW5HCkiYmINV5y2vw08NMWzHwUstL3O9t3ACuBASbsA29i+vszTnA8c3bbPeWX9YuCwVjU6kQkTp6RnJD092TLFbyoiNmWdddW3l7SkbZlwvqTNiZJuLV35F5S2WWz4mMuR0jarrI9u32Af2+uBR4EXTnbyyWbVf7ttfQ7w58BpwN3ArwLvAD4z2UkiYkjU76qvsT2n5j5nAu+nStXvp8pJf8bYw4GeoJ1J3hvXZGOc32mtS/pX4EjbPyxNV0m6mqq8PWOyE0XEJq5HlyPZfqC1LumzwGXl5QiwW9ums4H7SvvsMdrb9xmRtBnwfKYwNFBnjPMlbPy093upKs+IiJ7cclnGLFteB7Rm3C8F5pWZ8j2AvYAbba8C1ko6qIxfHgtc0rbP/LL+BuDqqdzUU+cC+KXAxyT9ne0nJW0BfAT4bo1jRMSmqguz6pIuAA6hGgsdAd4DHCJp/3LGlcDbAWwvk3QRcAfVg4VOKDPqAMdTzdBvSTWbfnlpPxv4vKQVVJXmvCnFNdU7JssDi78G7A607lX/MfBa29+f0kFqmCF5i6YP2mWDVnrfN/km0YCX9juAmm4FHrdrXz44Z2t5yf719tF1LO1gjLPv6tyrvkLSfsBBVDNR9wI3tGX0iBhmecjH2Gw/Lem/gJ3LuEFExNCZ8uSQpK0knQ38jOrCUiQdLek93QouIgbMkDyPs86s+mnATlT3nT9V2m5iCrcnRcQQGKIPa6vTVT8S2Mf2oyrPyrR9r6RduxNaRAycjHFuRFTd9F82VA8mfrzRiCJiMA3R8zjrdNW/A5w6qu0vgWuaCyciBla66mM6meo2yzcDW0m6jepZnId1JbKIGDzpqm/I9k/KdZxHAntQXfx+me2fTbxnRAyFIeqqTylxlpvfHwR2sv2l7oY0uAbtTpwn+x3AkPhevwOo6Vn9u0jF+Uu210taQ9U1z99bRGxsiCrOOpND7wHOlDRr0i0jYjhlcmgj/wbMAN4k6RnaHvZp+7lNBxYRAyb3qm+oPBnpjcC2wA8n2TwihtUAV5F1TJo4Jb0euJCq2nwKeL3tr3c7sIgYMBnj3MC7gXcBW1ONc76rqxFFxOB6puYyoKaSOPcATrP9BHA6sGd3Q4qIgZQ7hzYww/YzALZ/LikTQRExtgGuIuuYSuJ8rqT27vkWo15j+0PNhhURA2eIxjinkjhvAA5ve7141GsDSZw
RMTQmTZy2D+lBHBGxKUjFGRFRQy6Aj4joQCrOiIgaMjkUEdGBdNUjIuoZkoIziTMimjFEPfUkzohozpD01JM4I6IZqTgbJmkL4NvA5uWcF9t+Ty/OHRG9k4qzWeuAQ20/LmkmcJ2ky23f0KPzR0SXpeJsmG0Dj5eXM8vi8feIiEEzTImzzoe1PSuSZki6BVgNLLK9eIxtFkhaImlJsmrE4BmS5xj3LnHaftr2/sBs4EBJ+42xzVm259ieo14FFhGNGKLnGPcucbbYfgS4Fpjb63NHRHc1nTglnSNptaTb29q2k7RI0g/K1xe0vXeqpBWS7pJ0RFv7AZJuK++dIUmlfXNJF5b2xZJ2n8r32ZPEKWkHSduW9S2B3wXu7MW5I6I3Wg9Harirfi4bF1mnAFfZ3gu4qrxG0j7APGDfss+nJc0o+5wJLAD2KkvrmMcBD9veE/g48NGpBNWrinMX4BpJtwI3UY1xXtajc0fEgLL9beChUc1HAeeV9fOAo9vaF9peZ/tuYAXVsOAuwDa2ry8T1eeP2qd1rIuBw1rV6ER6Nat+K/DKXpwrIvqng3HL7SUtaXt9lu2zJtlnJ9urAGyvkrRjaZ9F9YkVLSOl7edlfXR7a597yrHWS3oUeCGwZqIAcudQRDSiw+cYr7E9p6EQxqoUPUH7RPtMqOeTQxGx6erRrPoDpftN+bq6tI8Au7VtNxu4r7TPHqN9g30kbQY8n42HBjaSxBkRjejh5UiXAvPL+nzgkrb2eWWmfA+qSaAbS7d+raSDyvjlsaP2aR3rDcDVZRx0QumqR0Rjmr6oXdIFwCFUY6EjwHuAjwAXSToO+AlwDIDtZZIuAu4A1gMn2G7l5+OpZui3BC4vC8DZwOclraCqNOdNKa4pJNe+mCF5i34HUdOgxftkvwOIaelJ4Gm79j0o+0peWHOfl8PSBsc4eyYVZ0Q0YpjuVU/ijIjGDPL953UkcUZEI1JxRkR0IBVnREQNqTgjIjqQxBkRUUOHt1wOpCTOiGhMKs6IiBoyxhkR0YFh6arnIR8RETWl4mzQi/sdQE139TuADszsdwAd+O1+B1DT/+twv3TVIyI6MCxd9STOiGhEKs6IiA4kcUZE1JAL4CMiOpCKMyKihoxxRkR0IF31iIgaUnFGRNSUyaGIiA6k4oyIqCFd9YiIDqSrHhFRQyrOiIgOJHFGRNQwTLPqeZBxRERNqTgjojHpqkdE1DBMXfWeJk5JM4AlwL22j+zluSOi+1JxdsdJwHJgmx6fNyK6bJguR+rZ5JCk2cBrgM/16pwR0VvP1FwGVS8rzk8AfwdsPd4GkhYACwDUo6AiohmpOBsm6Uhgte2lE21n+yzbc2zPSeKMGCytxFlnmQpJKyXdJukWSUtK23aSFkn6Qfn6grbtT5W0QtJdko5oaz+gHGeFpDMkdZxmetVVPxh4raSVwELgUElf6NG5I6JHuthV/x3b+9ueU16fAlxley/gqvIaSfsA84B9gbnAp8ukNMCZVD3avcoyt5PvEXqUOG2fanu27d2pvqmrbb+5F+eOiN7oVsU5jqOA88r6ecDRbe0Lba+zfTewAjhQ0i7ANravt23g/LZ9asudQxHRmA4qzu0lLWlbFoxxWAPfkLS07f2dbK8CKF93LO2zgHva9h0pbbPK+uj2jvT8Anjb1wLX9vq8EdFdHU4OrWnrfo/nYNv3SdoRWCTpzgm2HWvc0hO0dyR3DkVEY7oxq277vvJ1taSvAAcCD0jaxfaq0g1fXTYfAXZr2302cF9pnz1Ge0fSVY+IRrRuuWxyckjS8yRt3VoHfg+4HbgUmF82mw9cUtYvBeZJ2lzSHlSTQDeW7vxaSQeV2fRj2/apLRVnRDSmCxXnTsBXypVDmwH/bvsKSTcBF0k6DvgJcAyA7WWSLgLuANYDJ9huhXU8cC6wJXB5WTqSxBkRjejGBfC2fwS8Yoz2B4HDxtnng8AHx2hfAuzXRFxJnBHRmEG+jbKOJM6IaERuuYy
IiHGl4oyIxqSrHhFRwzB11ZM4I6IxSZwRETXkM4ciIjqQijMiooaMcUZE1JSuekREB1JxRkTUkIozIqIDqTj77BlY89/w4y4centgTReOy3e7cdBK12LukkGLF7oY82XdOGh3f8Yv7mSnTA5NA7Z36MZxJS2ZwqP6p5VBi3nQ4oXBi3m6xpuuekREDak4IyI6kMS56Tqr3wF0YNBiHrR4YfBinnbxDtOsuqrPZo+IeHa2klz3cykWw9LpOFY7mTzIOCKipmHsqkdEFwxTV31oKk5JcyXdJWmFpFP6Hc9kJJ0jabWk2/sdy1RJ2k3SNZKWS1om6aR+xzQRSVtIulHS90q87+t3TFMlaYak70rq0mWinXm65jKohiJxSpoBfAr4fWAf4E2S9ulvVJM6F5jb7yBqWg+cbPtlwEHACdP857wOONT2K4D9gbmSDupzTFN1ErC830G0a12OlMS56TgQWGH7R7afAhYCR/U5pgnZ/jbwUL/jqMP2Kts3l/W1VH/Ys/ob1fhceby8nFmWaT9bKmk28Brgc/2OZbRnai6DalgS5yzgnrbXI0zjP+hNgaTdgVcCi/sbycRKl/cWYDWwyPa0jrf4BPB3TLPck4pz06Mx2qZ9ZTGoJG0FfAl4h+3H+h3PRGw/bXt/YDZwoKS6V9T0lKQjgdW2l/Y7ltGSODc9I8Buba9nA/f1KZZNmqSZVEnzi7a/3O94psr2I8C1TP9x5YOB10paSTXkdKikL/Q3pF9KV33TchOwl6Q9JD0XmAdc2ueYNjmSBJwNLLd9er/jmYykHSRtW9a3BH4XuLO/UU3M9qm2Z9venerf8dW239znsIBUnJsc2+uBE4ErqSYsLrK9rL9RTUzSBcD1wEsljUg6rt8xTcHBwFuoqqBbyvIH/Q5qArsA10i6leo/10W2p9XlPYNmWCrO3HIZEY14ruSda+5zz4Decpk7hyKiMYPc/a4jiTMiGjFMt1wmcUZEY1JxRkTUkCfAR0R0IF31iD6SdC6w3vZb+x1LTM0wVZxDcR1nTJ2kd0uypGNr7GNJv9XNuCKmk1Sc8QuSngMcR/VUprcD5/c3ohgkz8CVa6vPe6+jW58N31VJnNHuCKr7+I8GLlIKVcoAAAHTSURBVJO0n+3bASS9HPgn4ABgBtWFy4dL+l7Z9xuSngEW2n5ruZf63ba/UPbfHbgb2M32iKTDgA8Be1M9x/Mq4K9sr+7NtxpNsz3d7/NvTLrq0e7twOW2/xP4HrAAQNIuwLfKsjuwM/BRgPIQYIDfs71VjTHJdVS3we4A/DqwK/AvzXwbEd2VxBkASNqV6uG455Smc4C3lIdfvIXqQdAftv2E7adsf/PZnM/2dbZvsr3e9v1U1exhz+aYEb2SxBktrbHN1kMuvgBsCbyRqsr8fpMnk3SApCsl3S/pMeACquozYtpL4ozWpNBbgW2BEUn3A3dQjWUuAFYCe01wiLGeFPM48Ly217uOen8hcDOwt+1tgDd1FHxEHyRxBlQP750NvJrqQ8tay2uAV1F9/MVLJb1T0q9Imlkmd1ruZ+PEuoTqQ/G2krQD8Pej3t8GeBRYK+lFwLT/5NGIliTOgGpS6Ku2l9q+v235BtUzQY8BDgEOp3qa/gPAO9v2/7/AP0p6WNJnStu7qa6HXkX1ZPWFo865gKrKXQt8GfiPbnxjEd2Q53FGRNSUijMioqYkzoiImpI4IyJqSuKMiKgpiTMioqYkzoiImpI4IyJqSuKMiKjp/wMtX5EUCzmbdQAAAABJRU5ErkJggg==", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "loss: 1.3856678911868263, accuracy: 0.11747619720965312, f1: 0.07200591838594485, precision: 0.27436380889422163, recall: 0.3566003931926984\n", "\n", "New checkpoint\n", "\n", "train mode | time: 09:00:38\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAUgAAAEnCAYAAADLttq8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7hcdX3v8ffHcIvSFGi4xASFalCBFmzSNErbY0VLWimhnkONfZDoQWMpKvZQK3hsqa0obZUiPcJjBCUUS0y9gSiXNIB9tNzCxUK4lFQQIoEYFImogcDn/LF+W4bNzOyZzZqZvWd/Xs+znr3mN+u31i+BfPfvvmSbiIh4tucNugARERNVAmRERAsJkBERLSRARkS0kAAZEdHCdoMuQEQMh0WLFnnz5s1d5bnxxhsvt72oR0V6zhIgI6IWmzdvZu3atV3lkTSzR8WpRQJkRNTEwLZBF6JWCZARUaMEyIiIJlKDjIhoIQEyIqKFBMiIiBaGL0BmoniPSXqjpCslPSJpq6T/kvThXk1vkHSIpJsk/UxSbVs1SfprSd1NcpvAJC2TdGQX158nqbs5LFPOSIDs5pjYUoPsIUkfB94LfBb4R+BRYH/gT4ADgD/swWM/BWwCDgO21njfc4Cv1ni/QVsG3AZ8pcPr/xaY3rviDIuJH/S6kQDZI5L+APg/wLG2P9Pw1TckLQd+t0ePfjmw3PY36ryp7Q3AhjrvORlImm77p7b/e9BlmfgMPDnoQtQqTeze+TPgplHBEQDbT9q+dOSzpJmSVkh6WNJPJF0taX5jHkn3SvqYpD+TtEHSDyWtlLRL+f41pUk9DfiEJEs6r3xnSe8adb9nNJkl7SLpHEkPlOb5fZI+3er6kravpK9IelTSFklflfTSUddY0gmSPiLp+5I2SfqkpB3b/eWNNGklvUHS7eXv5WuSdpP0UklXSXqsXPOro/KeKOkGST+S9NDockm6GpgHLC3ls6S3Nvw9f1zSX0raQFXrf1YTW9Ilku6UNH3Uc38m6YB2f7bhNXxN7ATIHpC0PfBq4LIOs3yFqkn858CbqP67XDU62AB/BBxK1Tx8P3A48JHy3U3Aq8r5x8v533ZR7NOB36QK7IcBH6D6P76pEuDWAK8A3gG8FdiXqoa826jLTwReCBwN/APwTuC
EDsr0IuBvgA9S/ZlfDSwHVpbjf1G1glZKUkO+OcD/AxaXsk0DviXpF8v3fwrcCXyd6u/pVcDXGvL/MfA/ynVvalG2dwC7Ax8FkPQK4MPAKbbXdfBnG0LDFyDTxO6NXwJ2BO4b60JJi4BDgNeMNIslXQncC7yPKpiMeAI40va2ct3+wBLgT20/Clxb4sS9tq/tsswLgE/a/nxD2gVtrn8bVQDbz/Z3SnmuA75TyvzRhmvvtf3Wcn65pEOANwJ/P0aZdgNeNdK8LTXF9wFLbZ9f0kQV3F4O3AFg+89GbiBpGrCaql92MXC+7dslPQZ8v83f0+G2f9aqYLY3llr55yR9tfx5bwY+NsafKSaRBMje6mQUeQHVP9Sf9xnafkzSJVQ1ukZXjQTH4nZgD0k72H78OZb1FuB9kp4E/s32f3VQ7ptGgmMp9wZJ32pS7itGfb4dmM/Y7h3V97e+/LyySdpsSoCUtJCq9vxrVEF2xH4dPBNgTbvgOML2hZLeSBWgnwIOsj1cnXBdm/i1wm6kid0bD1ONIL+og2tnAQ81SX+IZ/7jBnhk1OfHAQE7dFvAJt5F1dT/K+AuSXdLWtLm+uda7p06KFOzfKPTR9J2ApD0IqqALKqa7CHAr1PVIDt5JjT/c7VyIVVrYbXtu7vIN4SGr4mdANkDtp8AvkXVlzeWjcAeTdL3BH5QU5G28uwg+owgZvsR2++xvRdwEHAdVfNx/xb37Ee5x2MR8Hxgse0v2P4Pqtrx6KDdTkfzRyXNoJq+dTNwhKRO/nsPsQTI6NwZwHxJS0d/Iel5pe8RqkC0h6Tfbvj++cAbgG/WVJYNVIMpP38+8NpWF9v+T6q+vudR9e01cx0wT9K+DfedTTWQUle5x2M6VXO38V/fH/Hs7qROa7HtnEE1APRa4F+AcxoGgqag4QuQ6YPsEdtflXQ6cG4ZlLgI+DFVwPkTqkGYy2xfXvrtPi/pJKrm+Z9T/UP/h5qK82XgeEk3Uw2ivB2Y0XiBpG+W626j+j/9HcBjwPUt7nke1Uj6pZL+imoC3F8Dm6kmqw/KlVRB67OSzqWakP/nPLu5fidwWKn1PQzcY/vhTh8i6XCqgarfs/2IpHdT/d19gmpEfwrKUsPogu0TqaaJzKWqYaymmvKyBjiu4dI/LN+dAfwrVf/Za22vpx4fKvf9MFVguwUYPT/zGqp/2F8AVgEzqf7xN50cbnsr8DqqQHMusAL4LtVo/MCa2LZvpQpcvwFcQjVl5yjgR6Mu/TDVoM4q4AbgDzp9RpnGtBz4tO3LynN/QPVLZamqRQJT1HDVIGXXtlw3Iqaw+fP389q1/9RVHmnRjbY7mdEwEKlBRkRNetMHWVaPrZN0m6QLJe1UVlStLrMtVkvateH6kyWtl3RX48CZpHmSbi3fnTlqcUFTCZARUZP6A2QZ+HsPMN/2gVT9y0uAk6jmq86l6rI6qVw/snjiAKoZDWeVxQIAZ1OtyJpbjjHfppgAGRE16dko9nbAdEnbUU3heoBqVdSK8v0KYGTrusXASttbbd9DtZBggaRZwAzb17jqVzy/IU9LCZARUaOuA+TMsuHIyLGs8W62v0e1fPM+qrm3P7J9BbCn7Y3lmsY5ubOB+xtusaGkzeaZu1GNpLc1Yaf5bCe5juUh/TRj7EsmlI7ntEwgTw26AOPwynnzBl2Ertx7771s3rx5zP65ZxvXNJ/N7QZpSt/iYqqNUB4B/lXS0W3u16zcbpPe1oQNkDvQeobyRPX6QRegS58ddAHG4bFBF2Ac1q6dXBuRz58/3kHlnsyDfB3VHNXvA0j6EtVihIckzSqbhsyiWkoKVc1w74b8c6ia5BvK+ej0ttLEjoiajGyY280xpvuAhZKeX0adD6Wav3oxMLJKbSnVQgxK+hJJO5ZVXnOB60szfIukheU+xzTkaWnC1iAjYrKpvwZp+zpJX6Da73Qb1br35cDOwCpJx1IF0aPK9eskraLaMWobcHzDDkvHUS2UmA5cWo6
2EiAjokb1r46xfQpwyqjkrVS1yWbXnwqc2iR9LXBgN89OEzsiooXUICOiJsO3WUUCZETUJAEyIqKFBMiIiBYSICMi2kiAjIhoIjXIiIgWEiAjIlpIgIyIaCEBMiKijQTIiIgmUoOMiGghATIiooXhC5B9281H0qLyGsb1kk7q13Mjop9q3zB3oPpSgyyvXfwk1VsJNgA3SLrY9u39eH5E9MPw1SD71cReAKy3/R0ASSupXsSTABkxNIYvQParid3qVYzPIGnZyOsfh+uvOSImo37VIDt65aLt5VTvm+D50pivZIyIiWT4apD9CpCtXsUYEUMjAXK8bgDmltcwfg9YAvxxn54dEX2RADkutrdJehdwOTAN+Iztdf14dkT003AFyL7Ng7T9ddv72X5JeS1jRAyVkRpkN0d7kl4m6ZaG41FJ75W0m6TVku4uP3dtyHNymW99l6TDGtLnSbq1fHempGZjI8+Q175GRE3qD5C277J9sO2DgXnAT4AvAycBa2zPBdaUz0jan6oL7wBgEXBWmYcNcDawDJhbjkVjPT8BMiJqUn+AHOVQ4L9tf5dqHvWKkr4COLKcLwZW2t5q+x5gPbBA0ixghu1rbBs4vyFPS1mLHRE1GdcgzUxJaxs+Ly/T/ZpZAlxYzve0vRHA9kZJe5T02cC1DXlG5lw/Uc5Hp7eVABkRNeo6QG62PX+siyTtABwBnDzWpU3S3Ca9rQTIiKhJT6f5/B5wk+2HyueHJM0qtcdZwKaS3mrO9YZyPjq9rfRBRkRNetoH+Waebl4DXAwsLedLgYsa0pdI2rHMu54LXF+a41skLSyj18c05GkpNciIqElvapCSnk+1E9g7G5JPA1ZJOha4DzgKwPY6SauoNsLZBhxve2RfteOA84DpwKXlaCsBMiJq0psAafsnwC+NSnuYalS72fWnAs+aa217LXBgN89OgIyIGk38TXC7kQAZETXJWuyIiBaGL0BmFDsiooXUICOiJsNXg0yAjIgaJUBGRDSRGmRERAsJkH2zlWqfosnkrkEXICakvcfel3VCeWjsS1pIgIyIaM2ZKB4R0dxTgy5AvRIgI6IeZthWGiZARkRNEiAjItpIEzsioonUICMi2kgNMiKiidQgIyLaSICMiGjCpIkdEdFSapAREU0MYR9kdhSPiGghATIi6vNUl0cHJO0i6QuS7pR0h6RXSdpN0mpJd5efuzZcf7Kk9ZLuknRYQ/o8SbeW786Uxt5mKQEyIuox0sTu5ujMJ4DLbL8cOAi4AzgJWGN7LrCmfEbS/sAS4ABgEXCWpGnlPmcDy4C55Vg01oMTICOiPjXXICXNAH4bOBfA9uO2HwEWAyvKZSuAI8v5YmCl7a2276HaVnaBpFnADNvX2DZwfkOelhIgI6Ie46tBzpS0tuFYNuquvwx8H/ispJslnSPpBcCetjcClJ97lOtnA/c35N9Q0maX89HpbWUUOyLqMb5R7M2257f5fjvg14B3275O0icozekWmvUruk16W32pQUr6jKRNkm7rx/MiYkDqH6TZAGywfV35/AWqgPlQaTZTfm5quH7vhvxzgAdK+pwm6W31q4l9Hh10iEbEJNaDQRrbDwL3S3pZSToUuB24GFha0pYCF5Xzi4ElknaUtC/VYMz1pRm+RdLCMnp9TEOelvrSxLb975L26cezImKAejNR/N3A5yTtAHwHeBtV5W6VpGOB+4CjAGyvk7SKKohuA463f/6inOOoKmvTgUvL0Vb6ICOiHj1ai237FqBZP+WhLa4/FTi1Sfpa4MBunj2hAmQZwVoGzXtUI2KCG7KlhhMqQNpeDiwHmCaNOcIUERNIdvOJiGhjyGqQ/ZrmcyFwDfAySRtKx2pEDJPeLTUcmH6NYr+5H8+JiAFLEzsiookh3A8yATIi6jNkATKbVUREtJAaZETUI9N8IiLaGLImdgJkRNQjNciIiDZSg4yIaCLTfCIi2kgTOyKiidQgIyJaSICMiGgjTeyIiCamWg1
S0l91chPbf1NPcSJiUptiNcjfajgX8NvAg8B3gRcDewHf6E3RImJSmWo1SNuvHzmXdDpwJfBR2y5pJwMze1rCiJg8plKAHOUYYK+R4Fj8A1WN8sRaSxURk88UX2r4U6pXJt7SkPYrwM9qLVEhJt9ebEP2yzNqMnfQBejSj55L5h78I5B0L7Cl3H2b7fmSdgM+D+wD3Av8ke0flutPBo4t17/H9uUlfR5Pvxf768AJoyp8z9JNDDoLuEzShyS9TdKHykPO6uIeETGsRmqQ3Ryd+x3bB9seeT/2ScAa23OBNeUzkvYHlgAHAIuAsyRNK3nOpnqt9NxyLBrroR0HSNsfBd4HvKr8fDXwftsf6fQeERE1WQysKOcrgCMb0lfa3mr7HmA9sEDSLGCG7WtKrfH8hjwtdTUP0vY/A//cTZ6ImEK6b2LPlLS24fNy28tHXWPgCkkGPlW+39P2RgDbGyXtUa6dDVzbkHdDSXuinI9Ob6urACnpl6mqry+0/S5J+wHb217XzX0iYgiNb5rP5oZmcyuH2H6gBMHVku5sc61alKxVelsdN7ElvR74NrCQakQbYHfgY53eIyKGXA/6IG0/UH5uAr4MLAAeKs1mys9N5fINwN4N2ecAD5T0OU3S2+pmkOY04CjbR/D074mbgF/r4h4RMaxGapDdHGOQ9AJJvzByDvwucBtwMbC0XLYUuKicXwwskbSjpH2pBmOuL83xLZIWShJVJe8ixtBNE/slti8r5waw/VNJ23dxj4gYVr1ZSbMn8OUqprEd8C+2L5N0A7BK0rHAfcBRALbXSVoF3A5sA463PVKq43h6ms+l5WirmwB5v6QDbd82kiDpIKo5SBERtU8Ut/0d4KAm6Q8Dh7bIcypwapP0tVRzuTvWTRP7TOBLko4Gpkn6n8AFwD9288CIGFI9aGIPWsc1SNufLm339wPTgA8BZ5SpPxERU3qpIWX+0eg5ShERQ7mbTzfTfO5okX5rfcWJiEltqjaxeeYcok7SI2IqmYq7+Uj6wMi1DecjXgrcX3upImJymgS1wm50UoMc2TR3+4ZzqH5XPAj877oLFRGT0BD2QY4ZIG3/DoCkf7L97t4XKSImrSFrYnc1D1LSXo0JkvaU9NKayxQRk9EQzoPsJkD+C89+/8zuJT0iopcb5g5ENwFyv8ZlhsU6YL+xMkraW9JVku6QtE7SCV2VMiJiALqZ5vOIpJm2NzekzQQe6yDvNuBE2zeVnTlulLTa9u3dFDYiJrAhHKTppga5Gjhb0s4A5ec/AVeMldH2Rts3lfMtwB10sJtvREwyQ9YH2U0N8iSqvdYelrQJ2AO4ETiimwdK2gd4JXBdk++WUb1Up+n2vxExgU3FieIjbG+WdAjw68CLqbY5WzvWaxMblVrnF4H32n60yTN+vtZ7u+r9ExExmUyCWmE3ut2swsD15ehK2Vj3i8DnbH+p2/wRMcENYR9k2wAp6Uzb7ynnLXfxsb1sjPsIOBe4w/bp4yloREwCU6yJvX2L824dArwFuFXSLSXtA7a//hzuGRETyVSrQdo+ruH8beN9iO1vknGXiOE2lQdpIiLGNJVqkJKeooOXa9ueVluJImJymmpNbOC3Gs7nA38CfBy4B/hl4L3Ap3pTtIiYdIasid12JY3tb40cwFuBw22fY3uN7U9TTRIfd99kRAyRHu7mI2mapJslXVI+7yZptaS7y89dG649WdJ6SXdJOqwhfZ6kW8t3Z5bZNW11s9TwJTx79/DvUdUkIyJ6udTwBKolyiNOAtbYngusKZ+RtD+wBDgAWAScJWmkC/BsqpV6c8uxaKyHdhMgbwQ+JmmnUpCdgNOAm7u4R0QMq5FR7Jq3O5M0B3gDcE5D8mJgRTlfARzZkL7S9lbb9wDrgQWSZgEzbF9TFryc35CnpW5Gsd8BfBX4YcNa7O/S5VrsiBhi3Q/SzJS0tuHz8rLkuNEZwF8Av9CQtqftjVBthiNpj5I+G7i24boNJe2Jcj46va1
u1mKvl3QgsLDc+HvAtbaHbNwqIsZlfPMgN9ue3+pLSYcDm2zfKOk1HdyvWb+i26S31e1a7Ccl/Qew10j0jojooUOAIyT9PrATMEPSBcBDkmaV2uMsYFO5fgOwd0P+OcADJX1Ok/S2Ou6DlLSzpHOBn1K165F0pKRTOr1HRAy5mgdpbJ9se47tfagGX660fTTV1otLy2VLgYvK+cXAEkk7StqXajDm+lKh2yJpYRm9PqYhT0vdDNJ8HNiTKqI/XtJuAN7UxT0iYlj196VdpwGvl3Q31euoTwOwvQ5YBdwOXAYc39ANeBzVQM964L+BS8d6iDrdzlHS94D9bf9I0g9s71bSH7G9Szd/sk5sJ3nnum/aY08MugAxIf3GoAvQpbXAo3bXeyfMf4G89uXd5dFN3NiuD3LQuumDFFXz+umEagPcH9daooiYnIZwqWE3TexvASePSns3cFV9xYmISWsI34vdTQ3yRGCNpKOBnSXdSrVH5KE9KVlETD5Dtha7m3mQ95V5kIcD+1JNEr/E9k/b54yIKWEIm9gdBUhJ2wEPU81e/2Jvi1QxGfSI4bB27EsmlE5edN/SVKxB2t4maTNVk/pnvS1SRExKQ1iD7GaQ5hTgbEljrl+MiClqCg/SfBaYBrx59E7jtneou2ARMclM1XfSSHop1YqZXahmoEdEPNskqBV2Y8wAKemNwOepao+PA2/M61oj4lmmaB/kB4EPUO3Fdko5j4h4th5smDtInQTIfYGP234MOB14aW+LFBGT0hRdSTPN9lMAtp+QlAGZiGhuEtQKu9FJgNxBUmOzeqdRn7H9kXqLFRGTzhD2QXYSIK+l2m9txHWjPhtIgIyIoTNmgLT9mj6UIyKGwRSsQUZEjG2qThSPiOhIapAREU1M0UGaiIjOpIkdEdHckFUgEyAjoh5D2MLuaj/IiIi26l6KLWknSddL+rakdZI+VNJ3k7Ra0t3l564NeU6WtF7SXZIOa0ifJ+nW8t2ZksZ8tW0CZETUokdLsbcCr7V9EHAwsEjSQuAkYI3tucCa8hlJ+wNLgAOARcBZkqaVe50NLAPmlmPRWA/vS4Bs9VsgIoZL3TVIV35cPm5fDgOLgRUlfQVwZDlfDKy0vdX2PcB6YIGkWcAM29fYNnB+Q56W+lWDbPVbICKGxDhrkDMlrW04lo2+r6Rpkm4BNgGrbV9H9QLBjQDl5x7l8tnA/Q3ZN5S02eV8dHpbfRmkKRG72W+BiBgS4xyk2Wx7ftv72k8CB0vaBfhyef10K836Fd0mva2+9UG2+C0w+pplI79JEj0jJp9e7pdr+xHgaqq+w4dKs5nyc1O5bAOwd0O2OcADJX1Ok/S2+hYgbT9p+2Cqgi1o9lvA9nLb823PH3N4KSImlF4M0kjavdQckTQdeB1wJ3AxsLRcthS4qJxfDCyRtKOkfakGY64vzfAtkhaW0etjGvK01Pd5kLYfkXQ11W+B2/r9/IjonR7Mg5wFrCgj0c8DVtm+RNI1wCpJxwL3AUcB2F4naRVwO7ANOL400QGOA84DpgOXlqMtVd2DvSVpd+CJEhynA1cAf2f7klZ5pkneqecli+i9aWNfMqE8Bjxpd92I+1XJX+syz4vgxrH6IAepXzXIpr8F+vTsiIhx6dco9n8Cr+zHsyJicIZtqWHWYkdELYZwv9wEyIioT2qQERFNDONuPgmQEVGbNLEjIppIDTIiooUEyIiINtLEjohoIjXIiIg2UoOMiGgiNciIiDYSICMimshSw4iINlKDjIhoIn2QERFtDFsTu2/vpImImGxSg4zoscMGXYAurRlnvjSxIyLaGLYmdgJkRNQiNciIiDaGLUBmkCYiajEyUbybYyyS9pZ0laQ7JK2TdEJJ303Sakl3l5+7NuQ5WdJ6SXdJOqwhfZ6kW8t3Z0oa89W2CZARUZsnuzw6sA040fYrgIXA8ZL2B04C1tieSzWudBJA+W4JcACwCDirvG4a4GxgGTC3HIvGengCZETUYqQ
Pss4AaXuj7ZvK+RbgDmA2sBhYUS5bARxZzhcDK21vtX0PsB5YIGkWMMP2NbYNnN+Qp6X0QUZEbcYxij1T0tqGz8ttL292oaR9gFcC1wF72t4IVRCVtEe5bDZwbUO2DSXtiXI+Or2tBMiIqMU4R7E3254/1kWSdga+CLzX9qNtug+bfeE26W2liR0RtejFIA2ApO2pguPnbH+pJD9Ums2Un5tK+gZg74bsc4AHSvqcJultJUBGRG3q7oMsI83nAnfYPr3hq4uBpeV8KXBRQ/oSSTtK2pdqMOb60hzfImlhuecxDXlaShM7ImrRo4nihwBvAW6VdEtJ+wBwGrBK0rHAfcBRALbXSVoF3E41An687ZFiHQecB0wHLi1HWwmQEVGbupca2v4mzfsPAQ5tkedU4NQm6WuBA7t5fgJkRNQiSw0jItpIgIyIaGIY30mTUeyIiBZSg4yI2qSJHRHRxDA2sfsaIMuuGmuB79k+vJ/PjojeSw3yuTmBajeOGX1+bkT02DBO8+nbII2kOcAbgHP69cyI6K9erMUepH7WIM8A/gL4hVYXSFpGtaFly6nzETExpQY5TpIOBzbZvrHddbaX255ve34CZMTk0osNcwetXzXIQ4AjJP0+sBMwQ9IFto/u0/Mjog8mQ7O5G32pQdo+2fYc2/tQvS/iygTHiOGSGmRERBvDVoPse4C0fTVwdb+fGxG9NYyDNKlBRkRtEiAjIprIUsOIiDZSg4yIaCJ9kBERbaSJHRHRxDDWILOjeERECwmQEVGbunfzkfQZSZsk3daQtpuk1ZLuLj93bfjuZEnrJd0l6bCG9HmSbi3fnSmpo+0eEiAjohY9Wmp4HrBoVNpJwBrbc4E15TOS9qdaynxAyXNW2aQb4GyqncLmlmP0PZtKgIyI2tQdIG3/O/CDUcmLgRXlfAVwZEP6Sttbbd8DrAcWSJoFzLB9jW0D5zfkaSuDNBFRi3FOFJ8paW3D5+W2l4+RZ0/bGwFsb5S0R0mfDVzbcN2GkvZEOR+dPqYEyIiozThGsTfbnl/T45v1K7pN+pgSICOiFn2c5vOQpFml9jgL2FTSNwB7N1w3B3igpM9pkj6m9EFGRC1Gmth9eCfNxcDScr4UuKghfYmkHSXtSzUYc31pjm+RtLCMXh/TkKet1CAjojZ11yAlXQi8hqqvcgNwCnAasErSscB9wFEAttdJWgXcDmwDjrc9UqTjqEbEpwOXlmPs51eDOhPPNMk7DboQETX4/UEXoEtrgB/YXb8W6hcl/2aXeb4ON9bYB1m71CAjojbDttRwwgbIp2DzT+C7Pbj1TGBzD+7bS5OtzJOtvNDDMn+hFzft7d/xi8eTaRjXYk/YAGl7917cV9LaiVylb2aylXmylRcmX5knanmzm09ERBOpQUZEtJEAOfmNtYxpIppsZZ5s5YXJV+YJV95hfCfNhJ3mExGTy86SD+wyz3UTfJpPVtJERLQwFZvYEdEDw9jEnjI1SEmLyi7D6yWdNOjyjKXZTsoTnaS9JV0l6Q5J6ySdMOgytSNpJ0nXS/p2Ke+HBl2mTkmaJulmSZcMuiyNerBh7kBNiQBZdhX+JPB7wP7Am8vuwxPZeXS46/EEsg040fYrgIXA8RP873kr8FrbBwEHA4skLRxwmTp1AnDHoAvRqEc7ig/UlAiQwAJgve3v2H4cWEm1+/CE1WIn5QnN9kbbN5XzLVT/gDvamHQQXPlx+bh9OSb8qKWkOcAbgHMGXZbR+rSbT99MlQA5G7i/4XPHOwrH+EjaB3glcN1gS9JeaareQrWn4GrbE7q8xRnAXzDBYkxqkJPXuHcUju5J2hn4IvBe248Oujzt2H7S9sFUm6gukNTtTJW+knQ4sMn2jYMuy2gJkJNXq52Go2aStqcKjp+z/aVBl6dTth8Brmbi9/seAhwh6V6qrqLXSrpgsDsyT7sAAAMoSURBVEV6WprYk9MNwFxJ+0ragerVkBcPuExDp+zWfC5wh+3TB12esUj
aXdIu5Xw68DrgzsGWqj3bJ9ueY3sfqv+Pr7R99ICLBaQGOWnZ3ga8C7icauBgle11gy1Ve2Un5WuAl0naUHZPnugOAd5CVau5pRwTeb/YWcBVkv6T6pfoatsTatrMZDNsNcgsNYyIWuwgea8u89w/wZcaZiVNRNRmMjSbu5EAGRG1GMalhgmQEVGb1CAjIpoYxh3Fp8QodkT0Ry9GsQe50UxqkDEhSToP2Gb77YMuS3SmFzXIho1mXk+14OMGSRfbvr3mRzWVGmQ8g6QPSrKkY7rIY0ndvjM+ohMD3WgmNcj4OUnPA46l2kXoncD5gy1RTCZPweVbqvd1d2MnSWsbPi+33fi+nWYbzfzGeMvYrQTIaHQY1Tr1I4FLJB1o+zYASb8K/D0wD5hGNcH39ZK+XfJeIekpYKXtt5e1wh+0fUHJvw9wD7C37Q2SDgU+AuxHtY/kGuA9tjf1548adbPdi3XsA91oJk3saPRO4FLbXwO+DSwDkDQL+EY59gH2Av4OoGw2C/C7tnfuos9wK9Xyz92BXwFeCHyinj9GDJGBbjSTABkASHoh1SasnylJnwHeUjZxeAtVP9BHbT9m+3Hb//Zcnmf7m7ZvsL3N9oNUtdNDn8s9YygNdKOZBMgYMdL3OLJZwwXAdOBNVLXG/6rzYZLmSbpc0oOSHgUupKpNRvzcoDeaSYCMkcGZtwO7ABskPQjcTtXXuAy4F5jb5hbN+oR+DLyg4fMLR32/ErgJ2M/2DODN4yp8DD3bX7e9n+2X2D61n89OgAyoNomdA7ya6uVVI8cbgFdRvTbhZZLeL+n5krYvgywjHuTZAXQt1cvRdpa0O/CXo76fAfwI2CLpRcCEf9NkTD0JkAHV4MxXbN9o+8GG4wqqPSmPAl7D05N1HwLe35D//wJ/I+mHkj5V0j5INW94I9VO3StHPXMZVa11C/Al4F978QeLeC6yH2RERAupQUZEtJAAGRHRQgJkREQLCZARES0kQEZEtJAAGRHRQgJkREQLCZARES38f0TnyDeH8MSaAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n", "/anaconda/envs/azureml_py36/lib/python3.6/site-packages/ipykernel_launcher.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. 
(Triggered internally at ../torch/csrc/utils/tensor_new.cpp:201.)\n", " if __name__ == '__main__':\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "loss: 1.3225790266836843, accuracy: 0.333766129032258, f1: 0.16503508884653428, precision: 0.34857432644627095, recall: 0.4273865645595446\n", "val mode | time: 09:01:09\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAU4AAAErCAYAAACxamqAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3de7hdVX3u8e9LiCEWwy2AMUFDTVCBI+EkTaNYS4lKVCpooUaPEms0iqDoQQt4rDfESxVQrPAYhSagElK8gAhiSlAfLAR2uAgJULcSYUMghmugEgn8zh9jLFzZ2XvtNRdz3fZ6P88znz3XWHPMOXYgv4wxx00RgZmZ1W+7dhfAzKzbOHCamRXkwGlmVpADp5lZQQ6cZmYFOXCamRXkwGlmHUvSDpKuk3SzpDWSPpPTPy3pHkk35eMNVXlOltQv6Q5Jh1alz5R0S/7uTEnK6eMkXZjTV0maOlK5HDjNrJNtBg6JiAOAGcA8SXPyd2dExIx8XAYgaV9gPrAfMA84S9KYfP3ZwCJgej7m5fSFwEMRMQ04A/jSSIVy4DSzjhXJY/nj2HzUmrVzOLAsIjZHxJ1APzBb0iRgQkRcE2nWz3nAEVV5lubzi4C5ldrocLZv7NcxM9vavHnzYuPGjYXyrF69+oqImFfrmlxjXA1MA74REaskvR44TtLRQB9wQkQ8BEwGrq3KPpDTnszng9PJP+8GiIgtkh4BdgOG/WUcOM2sFBs3bqSvr69QHkkvlVSdaXFELK6+JiKeAmZI2hn4oaT9Sc3uU0i1z1OA04B3A0PVFKNGOiN8NyQHTjMrSQBbimbaGBGz6rp7xMOSfg7Mi4ivVNIlfQu4NH8cAPaqyjYFuDenTxkivTrPgKTtgZ2AB2uVxe84zaxEWwoetUnaPdc0kTQeeA1we35nWfFm4NZ8fgkwP/eU703qBLouItYDmyTNye8vjwYursqzIJ8fCayMEVY/co3TzErSUI1zJJOApfk953bA8oi4VNL5kmbkh64D3gcQEWskLQfW5sIcm5v6AMcAS4DxwOX5ADgHOF9SP6mmOX+kQsnLyplZGWbNOjD6+q4qlEfaZXW9TfVO4hqnmZWkKTXOjuR3nE0m6S2SVkp6WNJmSf8t6XOSJjbpeQdJukHSE5JKa07kmRrFxpp0MEmLJB0x8pXPXL9kUO+vbaMSOMt7x9mpXONsIkmnAR8G/p00I+FRYF/g/aSZDW9uwmO/CWwADiXNuijLt4Efl3i/dltE6lD4UZ3Xn0J6N2bD6p0apwNnk0j6e+D/Agsj4tyqr34haTHwuiY9+qWksXC/KPOmETHA1gOIe4Kk8RHxx4j4bbvL0h16I3C6qd48HwFuGBQ0gTSgNyIqPXpImihpqaQHJP2PpJ9L2uqFuaR1kr4i6SOSBiQ9JGlZ1VCNg3PTfAzwNUkhaUn+LiQdN+h+WzW9Je0s6duS7s3N/Lvy+Lghr89pe0v6kaRHJW2S9GNJ0wZdE5KOl/R5SX+QtEHSNySNq/WHV2kaS3qjpLX5z+UnknaVNE3SVZIez9e8fFDeEyRdL+kRSfcPLlceCzgTWJDLF5LeVfXnfJqkf5E0QGolbNNUl3SppNvzEJnq5z4ha
b9av9voFcBTBY/u5MDZBJLGAq8Eflpnlh+RmtYfBd5K+u9y1eAgBPwjMJfUzDwROAz4fP7uBuAV+fy0fH5KgWKfDryKFPAPBT5OjdkTOfBdCbwMeC/wLmBvUo1610GXnwC8AHgH8GXS0JHj6yjTC4HPAp8g/c6vBBYDy/JxJKnVtCyPzauYAvwbaQ7ye0n/mPxK0k75+w8AtwOXkf6cXgH8pCr/24G/zde9dZiyvRfYHfgCgKSXAZ8DPhURa+r43UYhv+O0Z2c3YBxw10gXSpoHHAQcXGleS1pJGpv2MfL4tOxJ4IiI2JKvq6wE84GIeBS4NsePdRFRPV+3HrNJ84AvrEr7To3r/4kU2PaJiN/l8qwCfpfL/IWqa9dFxLvy+RWSDgLeAvzrCGXaFXhFpZmca5YfAxZExHk5TaSg91LgNoCI+EjlBnn83wrSe9/DgfMiYq2kx4E/1PhzOiwinhiuYBGxPtfivyvpx/n3vRH4ynB5Rr/eecfpGmdz1dOrPZv0F/iZd5IR8ThpCtmrBl17VSVoZmuBPSQ951mXFG4CPibpA5L2qeP62aRXEb+rJOT3oL9i23L/bNDntWw9/W046wa9W+zPP1cOkVZZsAGl2SErJD1A+pv8P8COQD2/F8CVtYJmRURcAHyfFLj3JwX07m1/Wt0cOJvjAVKP9gvruHYScP8Q6feTalzVHh70+U+kBQrKCJzHkV4ZfBK4Q9JvJNWaQfFsy71DHWUaKt/g9EraDgCSXkgK1CLVfA8C/opU46znmTD07zWcC0itixUR8ZsC+Uap3miqO3A2QUQ8Sap5HTrStcB6YI8h0vdkhIUGCtjMtsF1q+AWEQ9HxIci4vnAAcAqUjN032Hu2YpyN2Ie8Fzg8Ii4KCL+i1SbHhzMa6lr/KukCaRhZjcCb1LVauO9qXfecTpwNs9XgVmSFgz+QtJ2+d0mpAC1h6RXV33/XOCNwNUllWWA1InzzPOBQ4a7OCJ+TXqXuB3p3eFQVgEzlRZSqNx3MqkDp6xyN2I88DRb/638R7Z9n19vrbeWr5I6ng4Bvgd8u6oDqgf1TuB051CTRMSPJZ0OnJM7Qy4GHiMFoveTOn9+GhFXSPoVcKGkk0jN/I+SAsCXSyrOD4FjJd1I6rx5DzCh+gJJV+frbiX9DXgv8Dhw3TD3XELq2b9c0idJY0s+TVr89ZsllbsRK0nB7N8lnUOaaPBRtm323w4cmmuJDwB3RsQD9T5E0mGkDrLX5+XOPkj6s/saaYRBD3LnkJUgIk4gDWeZTqqRrCANzbmStFJLxZvzd18F/oP0fu6QiOinHJ/J9/0cKeDdBAweX3oN6S/8RcByYCIpKAw56D0iNpOX+CKtLrMU+D1pdEDbmuoRcQspoP01qYPt7cBRwCODLv0cqRd+OXA98Pf1PiMPt1oMfCsifpqf+yDpH5sFSpMfelDv1Di9OpKZlWLWrJdEX99ZhfJIr/HqSGbWy3qnqe7AaWYlceA0MyvIgdPMrCAHTjOzBjhwttXE8YqpXTaU+P4iE/U6wH3tLkADnm53ARpw4MyZ7S5CIevWrWPjxo1D7TU+Atc4227qTtB3dLtLUcxXyhqu3iIjLU3UiR5vdwEa0NfXXTtuzJrV6OggB04zs4IqCxmPfg6cZlYS1zjNzBrQG4HTc9XNzApyjdPMSuKmuplZQQ6cZmYFOXCamRXkwGlm1oDeCJzuVTezkpS/ArykHSRdJ+lmSWskfSan75q3gP5N/rlLVZ6TJfVLuqN6Az1JMyXdkr87U5Jy+jhJF+b0VZKmjlQuB04zK0lTts7YTNpG5gBgBjBP0hzgJODKiJhO2ormJIC8K+t80l5T84CzJI3J9zobWETaymZ6/h5gIfBQREwj7Vr6pZEK5cBpZiUpP3BG8lj+ODYfARxO2ueK/POIfH44sCwiNkfEnUA/MFvSJGBCRFwTab+g8wblqdzrImBupTY6HAdOMytJczZrkzRG0k3ABmBFRKwC9oyI9
QD55x758snA3VXZB3La5Hw+OH2rPBGxhbSx3261yuTOITMrUeHOoYmSqpePWhwRi6sviIingBmSdgZ+KGn/GvcbqqYYNdJr5RmWA6eZlaSh4Ugb693lMu9f/3PSu8n7JU2KiPW5Gb4hXzYA7FWVbQpwb06fMkR6dZ4BSdsDOwE1t7h2U93MStKUXvXdc00TSeOB1wC3A5cAC/JlC4CL8/klwPzcU743qRPoutyc3yRpTn5/efSgPJV7HQmsjBH2TW9ZjVPSPOBrwBjg2xHxxVY928xaoSkD4CcBS3PP+HbA8oi4VNI1wHJJC4G7gKMAImKNpOXA2lyYY3NTH+AYYAkwHrg8HwDnAOdL6ifVNOePVKiWBM78S38DeC2pWny9pEsiYm0rnm9mrVLuQsYR8WvgwCHSHwDmDpPnVODUIdL7gG3ej0bEE+TAW69W1ThnA/0R8TsASctIQwAcOM1GDU+5LNtQQwT+ukXPNrOW6J3A2arOobq6+yUtktQnqe8Pf2xBqczMGtCqGudwQwS2ksdvLQaY9XzV7NUys07TOzXOVgXO64HpeXjAPaReq7e36Nlm1hIOnKWKiC2SjgOuIA1HOjci1rTi2WbWKg6cpYuIy4DLWvU8M2sHB04zswJc4zQzK8iB08ysIAdOM7OCHDjNzBrgwGlmVoBrnGZmBTlwmpkV5MBpZlaQA6eZWQPKXci4UzlwmllJXOM0MyuodwKnd7k0MyvINU4zK0nv1DgdOM2sRA6cZmYFuMZpZlaQA2fb3Xw/7PHldpeimCfaXYCCemPEXftN1VCbvHau9Q3ndOA0MysueuOfYwdOMyvP0+0uQGs4cJpZOYKeef/jwGlm5XDgNDNrQI801T3l0szKUalxFjlGIGkvSVdJuk3SGknH5/RPS7pH0k35eENVnpMl9Uu6Q9KhVekzJd2SvztTSsMdJI2TdGFOXyVp6kjlcuA0s/I8XfAY2RbghIh4GTAHOFbSvvm7MyJiRj4uA8jfzQf2A+YBZ0kak68/G1gETM/HvJy+EHgoIqYBZwBfGqlQDpxmVo4m1DgjYn1E3JDPNwG3AZNrZDkcWBYRmyPiTqAfmC1pEjAhIq6JiADOA46oyrM0n18EzK3URofjwGlm5SkeOCdK6qs6Fg1369yEPhBYlZOOk/RrSedK2iWnTQburso2kNMm5/PB6VvliYgtwCPAbrV+TQdOMytH0EhTfWNEzKo6Fg91a0k7At8HPhwRj5Ka3S8GZpAmO51WuXSYkg2XXivPsBw4zaw8JTfVASSNJQXN70bEDwAi4v6IeCoinga+BczOlw8Ae1VlnwLcm9OnDJG+VR5J2wM7AQ/WKpMDp5mVozm96gLOAW6LiNOr0idVXfZm4NZ8fgkwP/eU703qBLouItYDmyTNyfc8Gri4Ks+CfH4ksDK/Bx2Wx3GaWSc7CHgncIukm3Lax4G3SZpBCtfrgPcBRMQaScuBtaQe+WMjnplAfwywBBgPXJ4PSIH5fEn9pJrm/JEKpRECa9uMlWKXkS/rKF4dyYaye7sLUNB6YHNE4SWdZr1c0XdJsTzam9URMavos9rNNU4zK4enXJqZNaBHplw6cJpZOVzjNDMrqIcCZ0uGI+WR/Rsk3Try1WbWtcqfq96RWjWOcwl/nlBvZqNRE8ZxdqqWNNUj4pf1LNVkZl2ui4NhEX7HaWblqMxV7wEdFTjzyiiLwHNBzbqSa5ytl1dGWQxp5lCbi2NmRbjGaWbWgB6pcbZqONIFwDXASyQNSFrYiueaWQu5V71cEfG2VjzHzNrMTXUzswJ6aOaQA6eZladHAqdH/ZiZFeQap5mVw8ORzMwa0CNNdQdOMyuHa5xmZg1wjdPMrAAPRzIza4Cb6mZmBbjGaWZWkAOnmVkD3FQHSZ+s5yYR8dlyimNmXcs1zmf8TdW5gFcD9wG/B14EPB/4RXOKZmZdxzVOiIjXVs4lnQ6sBL4QEZHTTgYmNrWEZtYdXOMc0tHA8ytBM/syq
QZ6QqmlMrPu1COBs8jqSH8E9h+U9r+AJ8orjpl1rcqUyyLHCCTtJekqSbdJWiPp+Jy+q6QVkn6Tf+5SledkSf2S7pB0aFX6TEm35O/OlKScPk7ShTl9VT1bmRepcZ4F/FTSN4F1wFTSjpRfL3CPum0HPLcZN26ibvsXZEy7C9CAbqzQvKTdBSjooWeTufz/QFuAEyLiBknPA1ZLWgG8C7gyIr4o6STgJOBESfsC84H9gBcA/ylpn4h4CjibFLOuBS4D5gGXAwuBhyJimqT5wJeAt9YqVN01zoj4AvAx4BX55yuBEyPi8/Xew8xGsSbUOCNifUTckM83AbcBk4HDgaX5sqXAEfn8cGBZRGyOiDuBfmC2pEnAhIi4Jr9uPG9Qnsq9LgLmVmqjwyk0jjMizgfOL5LHzKwMuQl9ILAK2DMi1kMKrpL2yJdNJtUoKwZy2pP5fHB6Jc/d+V5bJD0C7AZsHK4shQKnpL8kVYNfEBHHSdoHGBsRa4rcx8xGqeJN9YmS+qo+L46IxYMvkrQj8H3gwxHxaI0K4VBfRI30WnmGVXdTXdJrgZuBOaQedoDdga/Uew8zG8Ua2x54Y0TMqjqGCppjSUHzuxHxg5x8f25+k39uyOkDwF5V2acA9+b0KUOkb5VH0vbATsCDtX7VIr3qXwSOiog38ed/V24A/neBe5jZaFZ+r7qAc4DbIuL0qq8uARbk8wXAxVXp83NP+d7AdOC63KzfJGlOvufRg/JU7nUksHLQsMttFGmqvzgifprPAyAi/pj/NTCzXtecAfAHAe8EbpF0U077OKkit1zSQuAu4CiAiFgjaTmwltQjf2zuUQc4BlgCjCf1pl+e088BzpfUT6ppzh+pUEUC592S9o+IWysJkg4gDU0ys17XhMAZEVcz9DtIgLnD5DkVOHWI9D62HYtORDxBDrz1KtJUPxP4gaR3AGMk/QPwHeCMIg80s1Gs5KZ6p6q7xhkR38rvBk4kjZ3+DPDVPETJzHqd56oPLfd4bdPrZWYGdHUtsogiw5FuGyb9lvKKY2Zdq7HhSF2pSI1zSsF0M+s1XRwMixgxcEr6eOXaqvOKaeSpSmbW4ypz1XtAPTXOymLGY6vOIf0R3Qe8u+xCmVmXco0ziYi/A5D09Yj4YPOLZGZdqYd61QuN45T0/OoESXtKmlZymcysW/XIOM4igfN7bLu/0O453cx6XQ/1qhcJnPtUT7fM1gD7jJRxuOXvzWyUcY1zGw9LGlzjnAg8XkfeyvL3LyMtS3dsXuLezKzrFAmcK4Cz84KilYVFvw78bKSMNZa/N7PRooea6kUGwJ9EWrfuAUkbgD2A1cCbijxw0PL3ZjaadHEwLKLIIh8bJR0E/BXwItJycn0jLfhZbfDy90N8v4i0C11X7sBo1tM8AH5oOUhel49Chln+fvD9n1lEZJxUd0A2sw7hGidIOjMiPpTPh10VKSIWjXCf4Za/N7PRoocGwI9U4xw7zHlRQy5/HxGXPYt7mlmncVMdIuKYqvN/avQhIyx/b2ajgWucZmYFuXMokfQ0I2zMDhAR7gQ3M9c4s7+pOp8FvB84DbgT+Evgw8A3m1M0M+sqbqonEfGryrmkfwMOi4jf5qQrJa0ELiLtgGlmvc5N9W28mG1Xe7+HVPM0s17XQzXOInPVVwNfkbQDQP75ReDGZhTMzLqQ56pv473Aj4GHquaq/56Cc9XNbJRyr/q2IqJf0v6kZeEmk5rp10ZEF/+7YWal6pFoUHSu+lOS/gt4fkSsb1KZzKwb9VCNs+53nJJ2lHQO8EegP6cdIelTzSqcmVknKtI5dBqwJ2ne+Z9y2vXAW8sulJl1qR7pHCoSOA8D/k9ErCbPJoqIe4AXNKNgZtZlmrACvKRzJW2QdGtV2qcl3SPppny8oeq7kyX1S7pD0qFV6TMl3ZK/OzOv2IakcZIuzOmr8kLrIyoSOEVqplf/UjsCjxW4h5mNZuVv1rYEmDdE+hkRMSMflwHkfczmA/vlP
GdJqkwHP5u0SPr0fFTuuRB4KCKmAWcAX6qnUEUC56+AkwelfRC4qsA9zGy0akKNMyJ+CTxYZwkOB5ZFxOaIuJPUFzNb0iRgQkRckxdjPw84oirP0nx+ETC3UhutpUiv+gmkaZbvAHaUdAtpjc65Be5hZqNVa2cOHSfpaKCPtIPuQ6RhktdWXTOQ057M54PTyT/vBoiILZIeAXYDNtZ6eJFxnHflcZyHAXuTBr9fGhF/rJ3TzHpG8eFIEyX1VX1enLfQqeVs4BRSqD6F1HH9boZe8zdqpDPCd8OqK3BK2h54ANgzIr5fTx4z6zGN1Tg3RsSsQo+JuL9yLulbwKX54wCwV9WlU4B7c/qUIdKr8wzkOLcTdbwaqCtw5irsRlLT/Il68jxbTwGbWvGgEnXx6AproqvbXYCCntVf8BYMgJc0qWoCzpuBSo/7JcD3JJ1OGu0zHbguT9zZJGkOaVvyo4GvV+VZAFwDHAmsrGfn3iLvOD8FnC3pxDwMyczsz5rwjlPSBcDBpCb9ACkOHSxpRn7iOuB9ABGxRtJyYC2wBTi2akr4MaQe+vHA5fmAtInk+ZL6STXN+XWVq95t0SU9SdruvDKx6pmMEfGcum5SwPZS7FT2TZusJVVxsyZ7AngqovAeYbPGKfomFcuj37O6aFO9E9T7jnMaaYbQzsBvR7jczHpRD81VHzFwSnoLcCGptvkn4C3e1tfMhtQjL/rrGQD/CeDjwPNI7xc+3tQSmVl3asIA+E5VT+DcGzgtIh4HTgemNbdIZta1yp9y2ZHqecc5JiKeBoiIJyWV3hFkZqNAD+05VE/gfI6k6ub5DoM+ExGfL7dYZtaVurgWWUQ9gfNa4LVVn1cN+hyAA6dZr3ON888i4uAWlMPMrGsU2nPIzKwm1zjNzArwAHgzswa4xmlmVoA7h8zMGuCmuplZMT1S4XTgNLNy9FBL3YHTzMrTIy311gROSTsAvwTG5WdeFBGfasWzzaw1XOMs32bgkIh4TNJY4GpJl0fEtSNlNLPu4RpnifLmR4/lj2PzUd+eHWbWFXqpxlnPepylkDRG0k3ABmBFRKxq1bPNrPl6aB3j1gXOiHgqImaQ9jSeLWn/wddIWiSpT1Kfq6Nm3adH1jFuXeCsiIiHgZ8D84b4bnFEzIqIWYW32DOztnKNs2SSdpe0cz4fD7wGuL0Vzzaz1umVwNmqXvVJwFJJY0jBenlEXNqiZ5tZC/TQ4kgt61X/NXBgK55lZtZsnjlkZqXp5uZ3EQ6cZlYKN9XNzBrgGqeZWQG9NHPIgdPMStMrTfWWD4A3s9GpGQPgJZ0raYOkW6vSdpW0QtJv8s9dqr47WVK/pDskHVqVPlPSLfm7MyUpp4+TdGFOXyVpaj2/qwOnmZWiSTOHlrDtLMOTgCsjYjpwZf6MpH2B+cB+Oc9Zeew4wNnAImB6Pir3XAg8FBHTgDOAL9VTKAdOMytN2XPVI+KXwIODkg8HlubzpcARVenLImJzRNwJ9JPWxZgETIiIa/JKbecNylO510XA3EpttBa/4zSzUrSwc2jPiFgPEBHrJe2R0ycD1Wv8DuS0J/P54PRKnrvzvbZIegTYDdhYqwAOnGZWmgY6hyZK6qv6vDgiFjf4+KFqilEjvVaemhw4zawUDdY4N0bErIJ57pc0Kdc2J5HW+IVUk9yr6ropwL05fcoQ6dV5BiRtD+zEtq8GtuF3nGZWmhatjnQJsCCfLwAurkqfn3vK9yZ1Al2Xm/WbJM3J7y+PHpSncq8jgZX5PWhNrnGaWSmaMeVS0gXAwaQm/QDwKeCLwHJJC4G7gKMAImKNpOXAWmALcGxEVOLzMaQe+vHA5fkAOAc4X1I/qaY5v65y1RFc22J7KXZqdyEKeqLdBTArwRPAUxGF1xJ/iRRnF8wzF1Y30FRvO9c4zawUnnJpZtYAT7k0M7MhdWyNM0ijVs263XvaXYCCljWYz011M7MG9EpT3YHTzErhGqeZWQMcOM3MCvCeQ2ZmD
XCN08ysAL/jNDNrgJvqZmYFuMZpZlaQO4fMzBrgGqeZWQFuqpuZNcBNdTOzAlzjNDNrgAOnmVkBvdSr7oWMzcwKco3TzErjprqZWQG91FRvaeCUNAboA+6JiMNa+Wwzaz7XOJvjeOA2YEKLn2tmTdZLw5Fa1jkkaQrwRuDbrXqmmbXW0wWPbtXKGudXgX8GnjfcBZIWAYsA1KJCmVk5XOMsmaTDgA0RsbrWdRGxOCJmRcQsB06z7lIJnEWObtWqGudBwJskvQHYAZgg6TsR8Y4WPd/MWqCbm99FtKTGGREnR8SUiJgKzAdWOmiajS6ucZqZNcA1ziaJiJ97DKfZ6NOsGqekdZJukXSTpL6ctqukFZJ+k3/uUnX9yZL6Jd0h6dCq9Jn5Pv2SzpTUcFeK56qbWWma2FT/u4iYERGz8ueTgCsjYjpwZf6MpH1JrwP3A+YBZ+WJNwBnk0btTM/HvEZ+R3DgNLOSVKZctmgc5+HA0ny+FDiiKn1ZRGyOiDuBfmC2pEnAhIi4JiICOK8qT2EOnGZWmgZqnBMl9VUdi4a4bQA/k7S66vs9I2I9QP65R06fDNxdlXcgp03O54PTG+LOITMrRYMD4DdWNb+Hc1BE3CtpD2CFpNtrXDvUe8uokd4Q1zjNrDTNaKpHxL355wbgh8Bs4P7c/Cb/3JAvHwD2qso+Bbg3p08ZIr0hDpxmVopm9KpL+gtJz6ucA68DbgUuARbkyxYAF+fzS4D5ksZJ2pvUCXRdbs5vkjQn96YfXZWnMDfVzayT7Qn8MI8c2h74XkT8VNL1wHJJC4G7gKMAImKNpOXAWmALcGxEVGL0McASYDxweT4a4sBpZqUpewB8RPwOOGCI9AeAucPkORU4dYj0PmD/MsrlwGlmpeil1ZEcOM2sNA6cZmYFeM8hM7MGuMZpZlaA33GamRXkprqZWQNc4zQzK8A1TjOzBrjG2WZPw8ZN8Psm3HoisLEJ922mbitzt5UXmljmM5tx0+b+Gb+okUzuHOoAEbF7M+4rqa+OZaw6SreVudvKC91X5k4tr5vqZmYFuMZpZtYAB87Ra3G7C9CAbitzt5UXuq/MHVfeXupVV9q3yMzs2dlRiqJrtq2C1Z34rnYkXgHezKygngmckublDer7JZ3U7vKMRNK5kjZIurXdZamXpL0kXSXpNklrJB3f7jLVImkHSddJujmX9zPtLlO9JI2RdKOkS9tdlooWbw/cVj0ROPOG9N8AXg/sC7wtb1zfyZYA89pdiIK2ACdExMuAOcCxHf7nvBk4JCIOAGYA8yTNaXOZ6nU8cFu7CzFY2XsOdaqeCJykXfH6I+J3EfEnYBlp4/qOFRG/BB5sdzmKiIj1EXFDPt9E+ovd8N7VzRbJY/nj2Hx0/Et/SVOANwLfbndZqjVjs7ZO1SuBc7hN6q1JJE0FDgRWtbckteUm702k7WVXRERHlzf7KvDPdGBr10310aXUzeitNkk7At8HPoxZcGQAAANtSURBVBwRj7a7PLVExFMRMYO0z/ZsSaVs5tUskg4DNkTE6naXZTDXOEef4Tapt5JJGksKmt+NiB+0uzz1ioiHgZ/T+e+VDwLeJGkd6ZXTIZK+094iJQ6co8/1wHRJe0t6DjCftHG9lUhp8+tzgNsi4vR2l2ckknaXtHM+Hw+8Bri9vaWqLSJOjogpETGV9P/xyoh4R5uL9Qw31UeRiNgCHAdcQeqwWB4Ra9pbqtokXQBcA7xE0oCkhe0uUx0OAt5JqgXdlI83tLtQNUwCrpL0a9I/risiomOG93SbXqpxeuaQmZVinBQvKJhnXZfOHOrFuepm1gReHcnMrAEOnGZmBfTS6kgOnGZWml6pcfZEr7qZNV+zetU7cYEeB07rSJKWSOqoudg2srLHcXbqAj0OnLYVSZ+QFJKOLpAnJL2qmeWyztekGmdHLtDjwGnPkLQdsJC0KtP72lwcM+jQBXrcOWTVDiXN4z8CuFTS/hFxK4CklwP/CswExpAGL
r9W0s05788kPQ0si4j35LnUn4iI7+T8U4E7gb0iYkDSXODzwD6kdTyvBD4UERta86ta2Z6GKzal/d6L2EFSX9XnxRFRvZ9SRy7Q48Bp1d4HXB4RP8kBcRHwIUmTgF+QAuc/AE8CrwaIiAMkBfC6iLi6wLM2k6bB3kj6y7Yc+BrwtrJ+GWutiGjGAikduUCPm+oGgKQXkBbHPTcnnQu8My9+8U7Se6YvRMTjEfGniPjPZ/O8iLg6Iq6PiC0RcR8pKM99Nve0UakjF+hx4LSKyrvNyiIX3wHGA28FpgL/XebDJM2UdIWk+yQ9ClwA7F7mM6z7deoCPQ6cVukUeg+wMzAg6T5gLeld5iJgHTC9xi2Geuf0GPAXVZ8Hr/+wDLgB2CciJuAmug0jIi6LiH0i4sURcWq7ywMOnJbMI707eiVp07LK8UbgFaTtL14i6URJz5U0NnfuVNzHtoG1jzTmbkdJuwP/Muj7CcAjwCZJLwQ6YmCzWT0cOA1Sp9CPImJ1RNxXdfyMtCboUcDBwGtJL+vvB06syv//gM9KekjSN3PaJ0hD9daTVlZfNuiZi0i13E3AD4D/aMYvZtYMXo/TzKwg1zjNzApy4DQzK8iB08ysIAdOM7OCHDjNzApy4DQzK8iB08ysIAdOM7OCHDjNzAr6/6Vf9etnx4TNAAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "trainer = Trainer(net=attn_model, lr=1e-3, batch_size=96, num_epochs=10)\n", "trainer.run()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": false, "source_hidden": false }, "nteract": { "transient": { "deleting": false } } }, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "7a6183492d0e103ac878e198fb5e468f3d279e98271ee06042fca66727adf0ef" }, "kernel_info": { "name": "python3" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "microsoft": { "host": { "AzureML": { "notebookHasBeenCompleted": true } } }, "nteract": { "version": "nteract-front-end@1.0.0" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 0 } ================================================ FILE: experiments/ecg_cnn/config.yaml ================================================ # Basic configuration file for running ecg_cnn example using json files. # Parameters needed to initialize the model model_config: model_type: SuperNet # class w/ `loss` and `inference` methods model_folder: experiments/ecg_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # how many iterations between metric eval on val set rec_freq: 500 # how many iterations between metric eval on test set initial_val: true initial_rec: true max_iteration: 2000 # how many iterations in total num_clients_per_iteration: 25 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: test_data.hdf5 test: batch_size: 10000 test_data: test_data.hdf5 type: model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 96 list_of_train_data: train_data.hdf5 desired_max_samples: 87000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.90 type: optimization ================================================ FILE: experiments/ecg_cnn/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
from experiments.ecg_cnn.dataloaders.dataset import Dataset
from core.dataloader import BaseDataLoader
import torch


class DataLoader(BaseDataLoader):
    """Data loader for the ECG experiment; wraps the experiment's `Dataset`
    in the FLUTE `BaseDataLoader` (a thin PyTorch DataLoader subclass)."""

    def __init__(self, mode, num_workers=0, **kwargs):
        # `mode` selects train vs. evaluation behavior: anything other than
        # 'train' loads the dataset in test_only mode and disables shuffling.
        args = kwargs['args']
        self.batch_size = args['batch_size']

        dataset = Dataset(
            data=kwargs['data'],
            test_only=(not mode=='train'),
            user_idx=kwargs.get('user_idx', None),
            file_type='hdf5',
        )

        super().__init__(
            dataset,
            batch_size=self.batch_size,
            shuffle=(mode=='train'),
            num_workers=num_workers,
            collate_fn=self.collate_fn,
        )

    def collate_fn(self, batch):
        # Batch is a list of (features, label) pairs; transpose it into two
        # tuples and convert each to a tensor under the 'x'/'y' keys that
        # the model's `loss`/`inference` methods expect.
        x, y = list(zip(*batch))
        return {'x': torch.tensor(x), 'y': torch.tensor(y)}


================================================
FILE: experiments/ecg_cnn/dataloaders/dataset.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import h5py
import numpy as np

from core.dataset import BaseDataset


class Dataset(BaseDataset):
    """Per-user ECG dataset.

    In test_only mode, all users' features/labels are stacked into single
    arrays; in train mode, only the data of the user selected by
    `user_idx` is exposed.
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx

        # Get all data
        self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data)

        if self.test_only:  # combine all data into single array
            self.user = 'test_only'
            self.features = np.vstack([user_data['x'] for user_data in self.user_data.values()])
            self.labels = np.hstack([user_label['x'] for user_label in self.user_data_label.values()])
        else:  # get a single user's data
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            self.user = self.user_list[user_idx]
            self.features = self.user_data[self.user]['x']
            self.labels = self.user_data_label[self.user]['x']

    def __getitem__(self, idx):
        # Reshape one beat into a single-channel (1, 187) float32 array,
        # matching the Conv1d input expected by the model.
        items = self.features[idx].astype(np.float32).T.reshape(1,187)
        return items, self.labels[idx]

    def __len__(self):
        return len(self.features)

    def load_data(self,data):
        '''Load data from disk or memory'''
        if isinstance(data, str):
            # NOTE(review): bare except hides the real h5py error (missing
            # file, permissions, corrupt file) behind a generic message.
            try:
                data = h5py.File(data, 'r')
            except:
                raise ValueError('Only HDF5 format is allowed for this experiment')

            users = []
            num_samples = data['num_samples']
            features, labels = dict(), dict()

            # Decoding bytes from hdf5
            decode_if_str = lambda x: x.decode() if isinstance(x, bytes) else x
            for user in data['users']:
                user = decode_if_str(user)
                users.append(user)
                features[user] = {'x': data['user_data'][user]['x'][()]}
                labels[user] = {'x': data['user_data_label'][user][()]}
        else:
            # In-memory dict: assumed to already follow the same schema as
            # the HDF5 branch ({'users', 'user_data', 'user_data_label',
            # 'num_samples'}, with per-user {'x': ...} entries) — the
            # consumers above index labels with ['x'], so confirm callers
            # provide that layout.
            users = data['users']
            features = data['user_data']
            labels = data['user_data_label']
            num_samples = data['num_samples']

        return users, features, labels, num_samples


================================================
FILE: experiments/ecg_cnn/model.py
================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''The model architecture used was first created by the user polomarco
for a Kaggle competition:
https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism
However, this example has been altered to fit the FLUTE architecture'''

import torch
from torch import nn
from torch.nn import functional as F

from core.model import BaseModel


# ReLu alternative
class Swish(nn.Module):
    # Swish activation: x * sigmoid(x).
    def forward(self, x):
        return x * torch.sigmoid(x)


class ConvNormPool(nn.Module):
    """Conv Skip-connection module"""
    def __init__(
        self,
        input_size,
        hidden_size,
        kernel_size,
        norm_type='bachnorm'  # (sic) any value other than 'group' selects BatchNorm1d
    ):
        super().__init__()

        self.kernel_size = kernel_size
        self.conv_1 = nn.Conv1d(
            in_channels=input_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.conv_2 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.conv_3 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.swish_1 = Swish()
        self.swish_2 = Swish()
        self.swish_3 = Swish()
        if norm_type == 'group':
            self.normalization_1 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
            self.normalization_2 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
            self.normalization_3 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
        else:
            self.normalization_1 = nn.BatchNorm1d(num_features=hidden_size)
            self.normalization_2 = nn.BatchNorm1d(num_features=hidden_size)
            self.normalization_3 = nn.BatchNorm1d(num_features=hidden_size)

        self.pool = nn.MaxPool1d(kernel_size=2)

    def forward(self, input):
        # First conv output is kept for the residual (skip) connection below.
        conv1 = self.conv_1(input)
        x = self.normalization_1(conv1)
        x = self.swish_1(x)
        # Left-pad by (kernel_size - 1) so the next conv preserves length.
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        x = self.conv_2(x)
        x = self.normalization_2(x)
        x = self.swish_2(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        conv3 = self.conv_3(x)
        # Skip connection: add the first conv's output before normalizing.
        x = self.normalization_3(conv1+conv3)
        x = self.swish_3(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        x = self.pool(x)
        return x


class RNN(nn.Module):
    """RNN module(cell type lstm or gru)"""
    def __init__(
        self,
        input_size,
        hid_size,
        num_rnn_layers=1,
        dropout_p = 0.2,
    ):
        super().__init__()
        # nn.LSTM applies inter-layer dropout only when num_layers > 1.
        self.rnn_layer = nn.LSTM(
            input_size=input_size,
            hidden_size=hid_size,
            num_layers=num_rnn_layers,
            dropout=dropout_p if num_rnn_layers>1 else 0,
            bidirectional=False,
            batch_first=True,
        )

    def forward(self, input):
        # For LSTM, hidden_states is the (h_n, c_n) tuple.
        outputs, hidden_states = self.rnn_layer(input)
        return outputs, hidden_states


class Net(nn.Module):
    """CNN + LSTM + attention network for 5-class ECG beat classification."""
    def __init__(
        self,
        input_size=1,
        hid_size=64,
        n_classes=5,
        kernel_size=5,
    ):
        super().__init__()

        # input_size=46: assumed to be the temporal length after the two
        # ConvNormPool blocks for 187-sample beats — TODO confirm.
        self.rnn_layer = RNN(
            input_size=46,
            hid_size=hid_size,
        )
        self.conv1 = ConvNormPool(
            input_size=input_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.conv2 = ConvNormPool(
            input_size=hid_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.avgpool = nn.AdaptiveMaxPool1d((1))
        self.attn = nn.Linear(hid_size, hid_size, bias=False)
        self.fc = nn.Linear(in_features=hid_size, out_features=n_classes)

    def forward(self, input):
        x = self.conv1(input)
        x = self.conv2(x)
        x_out, hid_states = self.rnn_layer(x)
        # Concatenate the LSTM's final hidden and cell states (h_n, c_n)
        # along dim 0, then move batch first — kept as in the original
        # Kaggle model.
        x = torch.cat([hid_states[0], hid_states[1]], dim=0).transpose(0, 1)
        # Attention: scores over the LSTM states applied to its outputs.
        x_attn = torch.tanh(self.attn(x))
        x = x_attn.bmm(x_out)
        x = x.transpose(2, 1)
        x = self.avgpool(x)
        x = x.view(-1, x.size(1) * x.size(2))
        # Output is softmax-normalized class probabilities.
        x = F.softmax(self.fc(x), dim=-1)
        return x


class SuperNet(BaseModel):
    '''This is the parent of the net with some extra methods'''
    def __init__(self, model_config):
        super().__init__()
        self.net = Net()

    def loss(self, input: torch.Tensor):
        # NOTE(review): F.cross_entropy expects raw logits, but Net.forward
        # already applies softmax, so the outputs are normalized twice —
        # kept as in the original Kaggle model.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        output = self.net.forward(features)
        return F.cross_entropy(output, labels.long())

    def inference(self, input):
        # Returns the per-batch outputs, accuracy and batch size in the
        # dict format FLUTE's evaluation loop consumes.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        output = self.net.forward(features)
        n_samples = features.shape[0]
        accuracy = torch.mean((torch.argmax(output, dim=1) == labels).float()).item()

        return {'output':output, 'acc': accuracy, 'batch_size': n_samples}


================================================
FILE: experiments/ecg_cnn/readme.md
================================================
# Example of CNN-LSTM model on Arrhythmia dataset

The objective of this experiment is to show the capabilities of FLUTE in data setting relevant to the healthcare sector.

### Federating the MIT-BIH Arrhythmia Dataset

In this experiment, a processed version of [MIT-BIH Arrhythmia Dataset](https://www.physionet.org/content/mitdb/1.0.0/) is used. In particular, we are using the dataset version found on [this Kaggle competition](https://www.kaggle.com/shayanfazeli/heartbeat).

Excerpt from the original [MIT-BIH Arrhythmia Database](https://physionet.org/content/mitdb/1.0.0/):

> The MIT-BIH Arrhythmia Database contains 48 half-hour excerpts of two-channel ambulatory ECG recordings, obtained from 47 subjects studied by the BIH Arrhythmia Laboratory between 1975 and 1979.
Twenty-three recordings were chosen at random from a set of 4000 24-hour ambulatory ECG recordings collected from a mixed population of inpatients (about 60%) and outpatients (about 40%) at Boston's Beth Israel Hospital; the remaining 25 recordings were selected from the same set to include less common but clinically significant arrhythmias that would not be well-represented in a small random sample. What this means for us: the federation in this example is a exemplar one, as the 47 subjects and their 48 half-hour excerpts are split up into the 109446 labeled samples of length 187. The sampling frequency is 125Hz and the number of categories is five. The categories are: ```['N': 0, 'S': 1, 'V': 2, 'F': 3, 'Q': 4]``` Or: ```-N : Non-ecotic beats (normal beat) -S : Supraventricular ectopic beats -V : Ventricular ectopic beats -F : Fusion Beats -Q : Unknown Beats``` The classes in the dataset are quite skewed; the *normal beats* class is present in 82.77% of samples. Using synthetic data could possibly increase the performance of the models by decreasing the class imbalance (e.g. by using [this GAN]([GitHub - mandrakedrink/ECG-Synthesis-and-Classification: 1D GAN for ECG Synthesis and 3 models: CNN, LSTM, and Attention mechanism for ECG Classification.](https://github.com/mandrakedrink/ECG-Synthesis-and-Classification)) for data synthesis) but is not too relevant for our experiment of transferring this experiment to FLUTE. #### Model architecture The model architecture is largely taken from [this notebook on Kaggle](https://www.kaggle.com/polomarco/ecg-classification-cnn-lstm-attention-mechanism). The architecture has been altered to fit the FLUTE architecture. The image below showcases the general model architecture. ![network](./net.png) The FLUTE-ready model can be found in `model.py`. Here, `SuperNet` is the parent class of the model various model network classes. `SuperNet` contains the `loss` and `inference` methods which FLUTE expects. 
`SuperNet` is therefore also the `model_type` set in `config.yaml`.

The file `centralized_model.ipynb` can be used to test a centralized run of the model. Running this model expects the csv test and train files to be added to a `.\ecg_cnn\data\mitbih\` folder. This model has higher performance than the remote model (roughly 94% as opposed to 87% accuracy). This is not fully unexpected, since the federated model could have more issues dealing with the class imbalance.

#### Preparing the data

First, place the `mitbih_test.csv` and `mitbih_train.csv` files in the folder `.\ecg_cnn\data\mitbih\`. Next, run preprocess.py in the `utils` folder to generate the HDF5 files.

## Specifying dataset and dataloaders

Inside the `dataloaders` folder, there are two files: `dataset.py` and `dataloader.py`. Both inherit from the base classes declared in the `core` folder, which under the hood inherit from Pytorch classes with the same name.

The dataset should be able to access all the data, and store it in the attributes `user_list`, `user_data`, `user_data_labels` and `num_samples` (user names, user features, user labels if the problem is supervised, and number of samples for each user, respectively). These attributes are required to have these exact names. Otherwise, it should also be able to access the examples of a specific user, whose id is passed during initialization via the `user_idx` argument.

The dataloader is simpler, and essentially just instantiates the dataset and creates batches with a specific format.

## Creating a config file

All the parameters of the experiment are passed in a YAML file. A documented example is provided in `config.yaml`.
## Running the experiment locally

Finally, to launch the experiment, it suffices to launch the `e2e_trainer.py` script using torch.distributed:

`python -m torch.distributed.run --nproc_per_node=2 .\e2e_trainer.py -dataPath experiments/ecg_cnn/data -outputPath scratch -config experiments/ecg_cnn/config.yaml -task ecg_cnn -backend nccl`

The `dataPath`, `outputPath` and `config` arguments should just specify the respective files or folders, as in the example above -- in this case, a folder called `scratch` will be created containing logs and checkpoints. The task should be the name of the folder inside `experiments`.

## Running the experiments on Azure Machine Learning

In order to run the experiment on Azure Machine Learning, you first need to follow the steps described [here](#Experiments). Make sure the HDF5 dataset is uploaded, the compute has a GPU and is running, and your YAML file is properly set up. An example file for running this experiment would be the following:

```yaml
experiment_name: ecg_cnn_run
description: FLUTE heartbeat dataset example
code:
  local_path: .
compute: azureml:compute_with_gpu
environment:
  image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
inputs:
  data:
    folder: azureml://datastores/workspaceblobstore/paths/data
    mode: rw_mount
command: >
  apt -y update &&
  apt -y install openmpi-bin libopenmpi-dev openssh-client &&
  python3 -m pip install --upgrade pip &&
  python3 -m pip install -r requirements.txt &&
  python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py
  -outputPath=./outputs -dataPath={inputs.data} -task=ecg_cnn
  -config=./experiments/ecg_cnn/config.yaml -backend=nccl
```

To run your job, you can then use the following command:

`az ml job create -f ./run.yaml -w "YourWorkspaceName" -g "YourResourceGroup"`

The job should now be created and uploaded, after which it can be found in the AzureML Studio.
================================================ FILE: experiments/ecg_cnn/utils/preprocess.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import h5py import time import tqdm import csv import pandas as pd from sklearn.utils import resample def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in tqdm.tqdm(data_dict['user_data'].items()): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in tqdm.tqdm(data_dict['user_data_label'].items()): user_data_label_group.create_dataset(user, data=user_data_label) def _process_and_save_to_disk(dataset, n_users, output): '''Process the dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) # Convert training data to expected format print('Converting data to expected format...') start_time = time.time() data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : n_users * [samples_per_user], 'user_data' : {f'{user_id:04d}': dataset.data[user_idxs(user_id)] for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } print(f'Finished converting data in {time.time() - 
start_time:.2f}s.') # Save training data to disk print('Saving data to disk...') start_time = time.time() with h5py.File(output + '.hdf5', 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) print(f'Finished saving data in {time.time() - start_time:.2f}s.') class HeartDataSet: def __init__(self, heartdata, cutoff): self.data = [row[:187] for row in heartdata][:cutoff] self.targets = [int(float(row[187])) for row in heartdata][:(round(len(heartdata), -3))][:cutoff] def __len__(self): return len(self.data) # From https://www.kaggle.com/gregoiredc/arrhythmia-on-ecg-classification-using-cnn/notebook # Can be used to creating resampled training set for less class imbalance def resampleSet(train_df): train_df[187]=train_df[187].astype(float).astype(int) df_1=train_df[train_df[187]==1] df_2=train_df[train_df[187]==2] df_3=train_df[train_df[187]==3] df_4=train_df[train_df[187]==4] df_0=(train_df[train_df[187]==0]).sample(n=40001,random_state=42) df_1_upsample=resample(df_1,replace=True,n_samples=10000,random_state=123) df_2_upsample=resample(df_2,replace=True,n_samples=20000,random_state=124) df_3_upsample=resample(df_3,replace=True,n_samples=5000,random_state=125) df_4_upsample=resample(df_4,replace=True,n_samples=20000,random_state=126) train_df=pd.concat([df_0,df_1_upsample,df_2_upsample,df_3_upsample,df_4_upsample]) return train_df # Uncomment lines below for resampled dataset with open('../data/mitbih/mitbih_test.csv') as f: testset = list(csv.reader(f , delimiter=',')) TestDataset = HeartDataSet(testset, 21000) _process_and_save_to_disk(TestDataset,1000,'../data/test_data') with open('../data/mitbih/mitbih_train.csv') as f: trainset = csv.reader(f , delimiter=',') trainsetlist = list(trainset) TrainDataset = HeartDataSet(trainsetlist, 87000) _process_and_save_to_disk(TrainDataset,1000,'../data/train_data') ================================================ FILE: experiments/fednewsrec/README.md 
================================================

### Data

In order to run this experiment, you first need to download the MIND dataset [here](https://msnews.github.io/index.html) and the glove.840B.300d embedding vector [here](https://nlp.stanford.edu/projects/glove/). Once you have the data, make sure to replace the `root_data_path` and `embedding_path` parameters inside [dataset.py](dataloaders/dataset.py) and the [configuration file](config.yaml). The preprocessing steps will be done automatically by FLUTE once the job is launched.

### Run

Once the paths for the dataset and embedding have been updated, you can run the experiment as follows:

```code
python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/fednewsrec/config.yaml -task fednewsrec -backend nccl
```

### Results

- MIND_Large, 1500 rounds, 6 clients per round:

|Platform|AUC|MRR|nDCG5|nDCG10|
|:----|:----|:----|:----|:----|
|FedNews|0.54|0.23|0.25|0.32|
|FLUTE|0.58|0.24|0.26|0.33|

================================================
FILE: experiments/fednewsrec/config.yaml
================================================
# Parameters needed to initialize the model
model_config:
  model_type: FEDNEWS # class w/ `loss` and `inference` methods
  model_folder: experiments/fednewsrec/model.py # file containing class
  embbeding_path: /mnt/data/MIND_large

# Configuration for differential privacy
dp_config:
  enable_local_dp: false # whether to enable user-level DP

# Additional privacy metrics
privacy_metrics_config:
  apply_metrics: false # cache data to compute additional metrics

# Select the Federated optimizer to use (e.g.
DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: true # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50 # how many iterations between metric eval on val set rec_freq: 2000 # how many iterations between metric eval on test set initial_val: true initial_rec: false max_iteration: 1500 # how many iterations in total num_clients_per_iteration: 500 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 1 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 1 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.1 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: auc fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 64 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.1 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/fednewsrec/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.fednewsrec.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] self.mode = mode dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): if self.mode == "train": # For training click, sample, label = list(zip(*batch)) click = torch.tensor(click) sample = torch.tensor(sample) label = torch.tensor(label) return {'x': (click, sample), 'y': label} else: # For testing -- data format is different nv_hist = torch.stack(batch[0][0]).squeeze(1) nv_imp = torch.stack(batch[0][1]).squeeze(1) label = batch[0][2] return {'x': (nv_hist, nv_imp), 'y': label} ================================================ FILE: experiments/fednewsrec/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class Dataset(BaseDataset):
    """Dataset for the FedNewsRec experiment.

    In test mode all users' data is flattened into one split; in train
    mode only the data of the user selected by ``user_idx`` is exposed.
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx

        # Load the full federated dataset (all users).
        (self.user_list,
         self.user_data,
         self.user_data_label,
         self.num_samples) = self.load_data(data, self.test_only)

        if user_idx == -1:
            # No per-user selection requested; keep only the raw tables.
            return

        if self.test_only:
            # Combine every user's data into flat lists.
            self.user = 'test_only'
            self.labels = list(self.user_data_label.values())
            self.features_x = [entry['x'] for entry in self.user_data.values()]
            self.features_y = [entry['y'] for entry in self.user_data.values()]
        else:
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            # Restrict to a single user's samples.
            self.user = self.user_list[user_idx]
            per_user = self.user_data[self.user]
            self.features_x = per_user['x']
            self.features_y = per_user['y']
            self.labels = self.user_data_label[self.user]

    def __getitem__(self, idx):
        return self.features_x[idx], self.features_y[idx], self.labels[idx]

    def __len__(self):
        return len(self.features_x)

    def load_data(self, data, test_only):
        '''Wrapper method to read/instantiate the dataset'''
        if data is None:
            # Fall back to preprocessing MIND from the default mount point.
            dataset = MIND(root_data_path="/mnt/data/MIND_large",
                           embedding_path="/mnt/data/MIND_large")
            data = dataset.testset if test_only else dataset.trainset
        return (data['users'], data['user_data'],
                data['user_data_label'], data['num_samples'])
class MIND:
    """Preprocesses the MIND dataset into FLUTE's federated dict format.

    Builds ``self.trainset`` (per-user click/candidate/label arrays) and
    ``self.testset`` (per-impression history/candidate tensors) from the
    raw MIND behavior/news files plus GloVe title embeddings.
    """

    # Cap on the number of training users kept in the federated split.
    MAX_TRAIN_USERS = 50000

    def __init__(self, root_data_path, embedding_path):
        # --- Preprocessing: parse news, embeddings and click histories ---
        news, news_index, category_dict, subcategory_dict, word_dict = \
            read_news(root_data_path, ['train', 'val'])
        news_title, news_vert, news_subvert = get_doc_input(
            news, news_index, category_dict, subcategory_dict, word_dict)
        title_word_embedding_matrix, have_word = load_matrix(embedding_path, word_dict)
        train_session, train_uid_click, train_uid_table = read_clickhistory(root_data_path, 'train')
        test_session, test_uid_click, test_uid_table = read_clickhistory(root_data_path, 'val')
        train_user = parse_user(train_session, news_index)
        test_user = parse_user(test_session, news_index)
        train_sess, train_user_id, train_label, train_user_id_sample = \
            get_train_input(train_session, train_uid_click, news_index)
        test_impressions, test_userids = get_test_input(test_session, news_index)
        get_user_data = GetUserDataFunc(
            news_title, train_user_id_sample, train_user,
            train_sess, train_label, train_user_id)

        # --- Train split: one entry per user ---
        print("Preparing train datasets ...")
        train_dict = {'users': [], 'num_samples': [],
                      'user_data': dict(), 'user_data_label': dict()}
        # Fix: clamp to the number of available users instead of assuming
        # the behaviors file always yields at least 50000 distinct users,
        # which raised a KeyError on smaller subsets.
        num_train_users = min(self.MAX_TRAIN_USERS, len(train_uid_table))
        for uidx in range(num_train_users):
            uid = train_uid_table[uidx]
            click, sample, label = get_user_data(uid)
            user = str(uidx)  # uid
            train_dict['users'].append(user)
            train_dict['num_samples'].append(len(click))
            train_dict['user_data'][user] = {'x': click, 'y': sample}
            train_dict['user_data_label'][user] = label

        # --- Test split: one entry per impression ---
        print("Preparing test datasets ...")
        test_dict = {'users': [], 'num_samples': [],
                     'user_data': dict(), 'user_data_label': dict()}
        # Pre-build one (1, MAX_SENTENCE) tensor per news article so the
        # per-impression loops below only index, never re-allocate.
        doc_cache = []
        for j in range(len(news_title)):
            doc_cache.append(torch.from_numpy(np.array([news_title[j]])))
        for i in range(len(test_impressions)):
            docids = test_impressions[i]['docs']
            labels = test_impressions[i]['labels']
            nv_hist = [doc_cache[j] for j in test_user['click'][i]]
            nv_imp = [doc_cache[j] for j in docids]
            user = str(i)
            test_dict['users'].append(user)
            test_dict['num_samples'].append(len(nv_imp))
            test_dict['user_data'][user] = {'x': nv_hist, 'y': nv_imp}
            test_dict['user_data_label'][user] = labels

        self.trainset = train_dict
        self.testset = test_dict
def load_matrix(embedding_path, word_dict):
    """Load GloVe vectors for the words in ``word_dict``.

    Returns a ``(len(word_dict)+1, 300)`` matrix (row 0 is the padding
    row, left as zeros) plus the list of vocabulary words that were
    found in the embedding file.
    """
    embedding_matrix = np.zeros((len(word_dict) + 1, 300))
    have_word = []
    glove_file = os.path.join(embedding_path, 'glove.840B.300d.txt')
    with open(glove_file, 'rb') as f:
        # Iterating the file object stops at EOF, mirroring the original
        # manual readline loop.
        for raw_line in f:
            fields = raw_line.split()
            word = fields[0].decode()
            if word in word_dict:
                row = word_dict[word]
                embedding_matrix[row] = np.array([float(v) for v in fields[1:]])
                have_word.append(word)
    return embedding_matrix, have_word
def get_test_input(session, news_index):
    """Build per-session test impressions.

    Each impression lists the candidate doc ids (positives first, then
    negatives) with parallel 1/0 relevance labels. Also returns the
    session ids as an int32 array.
    """
    impressions = []
    session_ids = []
    for sess_id, (_, positives, negatives) in enumerate(session):
        session_ids.append(sess_id)
        docs = [news_index[nid] for nid in positives]
        labels = [1] * len(positives)
        docs += [news_index[nid] for nid in negatives]
        labels += [0] * len(negatives)
        impressions.append({'labels': labels, 'docs': docs})
    return impressions, np.array(session_ids, dtype='int32')
class AttentivePooling(nn.Module):
    """Attention-weighted pooling over the sequence dimension.

    Scores each of the ``dim1`` timesteps of a (batch, dim1, dim2) input
    with a small MLP and returns the attention-weighted sum of the
    timestep vectors, shape (batch, dim2).
    """

    def __init__(self, dim1: int, dim2: int):
        super(AttentivePooling, self).__init__()
        self.dim1 = dim1
        self.dim2 = dim2
        self.dropout = nn.Dropout(0.2)
        self.dense = nn.Linear(dim2, 200)
        self.tanh = nn.Tanh()
        self.dense2 = nn.Linear(200, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        vecs = self.dropout(x)
        # Per-timestep attention logits: (batch, seq, 1) -> (batch, seq).
        logits = self.dense2(self.tanh(self.dense(vecs))).squeeze(2)
        weights = self.softmax(logits)
        # Weighted sum over the sequence axis.
        return torch.einsum('ijk,ij->ik', vecs, weights)

    def fromTensorFlow(self, tfmodel):
        """Copy weights from the matching Keras model (moved to GPU)."""
        dense_w = tfmodel.layers[1].get_weights()
        self.dense.weight.data = torch.tensor(dense_w[0]).transpose(0, 1).cuda()
        self.dense.bias.data = torch.tensor(dense_w[1]).cuda()
        dense2_w = tfmodel.layers[2].get_weights()
        self.dense2.weight.data = torch.tensor(dense2_w[0]).transpose(0, 1).cuda()
        self.dense2.bias.data = torch.tensor(dense2_w[1]).cuda()
class Permute(nn.Module):
    """Reorders tensor axes; an ``nn.Module`` form of ``Tensor.permute``."""

    def __init__(self, *dims):
        super(Permute, self).__init__()
        # Axis order applied to every input tensor.
        self.dims = tuple(dims)

    def forward(self, x):
        return x.permute(*self.dims)
class VecTail(nn.Module):
    """Keeps only the trailing ``n`` timesteps of a (batch, seq, dim) tensor."""

    def __init__(self, n):
        super(VecTail, self).__init__()
        self.n = n

    def forward(self, x):
        # Negative slicing keeps at most the final n entries along dim 1
        # (the whole sequence when it is shorter than n).
        return x[:, -self.n:, :]
news_vecs_input = Input(shape=(50,400), dtype='float32') #self.dropout1 = nn.Dropout(0.2) #self.tail = VecTail(15) #self.gru = nn.GRU(400, 400) #self.attention = nn.MultiheadAttention(400, 20) #self.pool = AttentivePooling(50, 400) #self.attention2 = nn.MultiheadAttention(400, 20, batch_first=True) self.attention2 = Attention(400, 20, 20) #torch.nn.init.xavier_uniform_(self.attention2.in_proj_weight, gain=np.sqrt(2)) self.dropout2 = nn.Dropout(0.2) self.pool2 = AttentivePooling(50, 400) self.tail2 = VecTail(20) #TODO: what is batch_first? self.gru2 = nn.GRU(400,400, bidirectional=False, batch_first=True) self.pool3 = AttentivePooling(2, 400) def forward(self, news_vecs_input): #news_vecs =self.dropout1(news_vecs_input) #gru_input = self.tail(news_vecs) #vec1 = self.gru(gru_input) #vecs2 = self.attention(*[news_vecs]*3) #vec2 = self.pool(vecs2) # print('news_vecs_input', news_vecs_input.shape) user_vecs2 = self.attention2([news_vecs_input]*3) user_vecs2 = self.dropout2(user_vecs2) user_vec2 = self.pool2(user_vecs2) # print('pool2_user_vec2', user_vec2.shape) #user_vec2 = keras.layers.Reshape((1,400))(user_vec2) #user_vec2 = user_vec2.unsqueeze(1) user_vecs1 = self.tail2(news_vecs_input) # print('tail2_user_vecs1', user_vecs1.shape) self.gru2.flatten_parameters() user_vec1, _u_hidden = self.gru2(user_vecs1) # print('gru2_user_vec1', user_vec1.shape) # TODO: does this flatten the second dimension? 
print out the shape to check user_vec1 = user_vec1[:, -1, :] #user_vec1 = keras.layers.Reshape((1,400))(user_vec1) #user_vec1 = user_vec1.unsqueeze(1) user_vecs = torch.stack([user_vec1, user_vec2], dim=1) #keras.layers.Concatenate(axis=-2)([user_vec1,user_vec2]) # print(user_vecs.shape) vec = self.pool3(user_vecs) # print(vec.shape) return vec def fromTensorFlow(self, tfU): for l in tfU.layers: print(l.name, l.output_shape) if l.name == 'model_1': self.pool2.fromTensorFlow(l) elif l.name == 'model_2': self.pool3.fromTensorFlow(l) elif l.name=='gru_1': print(len(l.get_weights()), [p.shape for p in l.get_weights()]) weights = l.get_weights() for p in self.gru2.named_parameters(): s1 = p[1].data.shape if p[0] == 'weight_ih_l0': p[1].data = torch.tensor(weights[0]).transpose(0,1).contiguous().cuda() elif p[0] == 'weight_hh_l0': p[1].data = torch.tensor(weights[1]).transpose(0,1).contiguous().cuda() elif p[0] == 'bias_ih_l0': p[1].data = torch.tensor(weights[2]).cuda() elif p[0] == 'bias_hh_l0': p[1].data = torch.zeros(p[1].data.shape).cuda() print(p[0], s1, p[1].shape) self.attention2.fromTensorFlow(tfU) # TODO: GRU class TimeDistributed(nn.Module): def __init__(self, module): #, batch_first=False): super(TimeDistributed, self).__init__() self.module = module # self.batch_first = batch_first def forward(self, x): # print('TimeDist_x',x.size()) if len(x.size()) <= 2: return self.module(x) output = torch.tensor([]).cuda(x.get_device()) for i in range(x.size(1)): output_t = self.module(x[:, i, :, :]) output_t = output_t.unsqueeze(1) output = torch.cat((output, output_t ), 1) # print('TimeDist_output', output.size()) return output # # Squash samples and timesteps into a single axis # x_reshape = x.contiguous().view(x.size(0), -1, x.size(-1)) # (samples * timesteps, input_size) #print('TimeDist_x_reshape',x_reshape.shape) # y = self.module(x_reshape) # print('TimeDist_y', y.shape) # # We have to reshape Y # if self.batch_first: # y = y.contiguous().view(x.size(0), -1, 
class FedNewsRec(nn.Module):
    """News recommendation model: scores candidate news against a user
    vector built from the user's click history.

    Ported to PyTorch from the FedNewsRec (EMNLP Findings 2020) Keras code.
    """

    def __init__(self, title_word_embedding_matrix):
        super(FedNewsRec, self).__init__()
        self.doc_encoder = DocEncoder()
        self.user_encoder = UserEncoder()
        # Frozen pretrained word embeddings for news titles.
        self.title_word_embedding_layer = nn.Embedding.from_pretrained(
            torch.tensor(title_word_embedding_matrix, dtype=torch.float),
            freeze=True,
        )
        self.softmax = nn.Softmax(dim=1)
        # The same doc encoder is applied per-timestep to both inputs.
        self.click_td = TimeDistributed(self.doc_encoder)
        self.can_td = TimeDistributed(self.doc_encoder)

    def forward(self, click_title, can_title):
        """Return ``(logits, user_vec)``.

        ``click_title``: token ids of clicked news (batch, 50, 30).
        ``can_title``:   token ids of candidates (batch, 1+npratio, 30).
        Logits are unnormalized; PyTorch's CrossEntropyLoss accepts raw
        scores, so no softmax is applied here.
        """
        click_word_vecs = self.title_word_embedding_layer(click_title)
        can_word_vecs = self.title_word_embedding_layer(can_title)

        click_vecs = self.click_td(click_word_vecs)   # (batch, 50, 400)
        can_vecs = self.can_td(can_word_vecs)         # (batch, 1+npratio, 400)

        user_vec = self.user_encoder(click_vecs)      # (batch, 400)
        # Dot product of every candidate vector with the user vector.
        logits = torch.einsum('ijk,ik->ij', can_vecs, user_vec)
        return logits, user_vec

    def news_encoder(self, news_title):
        """Encode raw title token ids directly into news vectors."""
        word_vecs = self.title_word_embedding_layer(news_title)
        return self.doc_encoder(word_vecs)
class FEDNEWS(BaseModel):
    '''This is a PyTorch model with some extra methods'''

    def __init__(self, model_config):
        super().__init__()
        # NOTE(review): both paths deliberately read the same (misspelled)
        # 'embbeding_path' key -- kept as-is for config compatibility.
        root_data_path = model_config['embbeding_path']
        embedding_path = model_config['embbeding_path']
        news, news_index, category_dict, subcategory_dict, word_dict = \
            self.read_news(root_data_path, ['train', 'val'])
        title_word_embedding_matrix, _ = self.load_matrix(embedding_path, word_dict)
        self.net = FedNewsRec(title_word_embedding_matrix)

    def loss(self, input: torch.Tensor) -> torch.Tensor:
        '''Performs forward step and computes the loss'''
        if not self.net.training:
            return torch.tensor(0)  # Not using the loss during evaluation
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        (click, sample), label = input['x'], input['y']
        click = click.to(device)
        sample = sample.to(device)
        label = label.to(device)
        output, _ = self.net.forward(click, sample)
        return CrossEntropyLoss()(output, label)

    def inference(self, input):
        '''Performs forward step and computes metrics'''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        (nv_hist, nv_imp), labels = input['x'], input['y']
        nv_hist = nv_hist.to(device)
        nv_imp = nv_imp.to(device)
        # Candidate news vectors and the user's vector (from history).
        nv = self.net.news_encoder(nv_imp).detach().cpu().numpy()
        nv_hist = self.net.news_encoder(nv_hist)
        uv = self.net.user_encoder(nv_hist.unsqueeze(0)).detach().cpu().numpy()[0]
        score = np.dot(nv, uv)
        return {
            'output': None,
            'acc': ndcg_score(labels, score, k=1),
            'batch_size': 1,
            'auc': {'value': roc_auc_score(labels, score), 'higher_is_better': True},
            'mrr': {'value': mrr_score(labels, score), 'higher_is_better': True},
            'ndcg5': {'value': ndcg_score(labels, score, k=5), 'higher_is_better': True},
            'ndcg10': {'value': ndcg_score(labels, score, k=10), 'higher_is_better': True},
        }

    def read_news(self, root_data_path, modes):
        '''Parse news.tsv for each mode; build news/category/word vocabularies.

        Document and word indices start at 1 so index 0 can act as padding.
        '''
        news = {}
        categories = []
        subcategories = []
        news_index = {}
        doc_counter = 1
        word_dict = {}
        word_counter = 1
        for mode in modes:
            with open(os.path.join(root_data_path, mode, 'news.tsv'), encoding="utf8") as f:
                for line in f.readlines():
                    doc_id, vert, subvert, title = line.strip('\n').split('\t')[0:4]
                    if doc_id in news_index:
                        continue
                    news_index[doc_id] = doc_counter
                    doc_counter += 1
                    categories.append(vert)
                    subcategories.append(subvert)
                    title = word_tokenize(title.lower())
                    news[doc_id] = [vert, subvert, title]
                    for word in title:
                        word = word.lower()
                        if word not in word_dict:
                            word_dict[word] = word_counter
                            word_counter += 1
        category_dict = {c: i for i, c in enumerate(set(categories), start=1)}
        subcategory_dict = {c: i for i, c in enumerate(set(subcategories), start=1)}
        return news, news_index, category_dict, subcategory_dict, word_dict

    def load_matrix(self, embedding_path, word_dict):
        '''Load the GloVe vectors for words present in word_dict.'''
        embedding_matrix = np.zeros((len(word_dict) + 1, 300))
        have_word = []
        with open(os.path.join(embedding_path, 'glove.840B.300d.txt'), 'rb') as f:
            for raw in f:
                fields = raw.split()
                word = fields[0].decode()
                if word in word_dict:
                    embedding_matrix[word_dict[word]] = \
                        np.array([float(v) for v in fields[1:]])
                    have_word.append(word)
        return embedding_matrix, have_word
def mrr_score(y_true, y_score):
    """Mean reciprocal rank of the relevant items under the score ranking."""
    ranking = np.argsort(y_score)[::-1]
    relevance = np.take(y_true, ranking)
    reciprocal = relevance / (np.arange(len(relevance)) + 1)
    return np.sum(reciprocal) / np.sum(relevance)


def ndcg_score(y_true, y_score, k=10):
    """Normalized DCG@k: actual DCG divided by the ideal (best) DCG."""
    ideal = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    return actual / ideal


def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain of the top-k items ranked by y_score."""
    top_k = np.argsort(y_score)[::-1][:k]
    gains = 2 ** np.take(y_true, top_k) - 1
    discounts = np.log2(np.arange(len(top_k)) + 2)
    return np.sum(gains / discounts)
@dataclass
class BERTModelConfig(Config):
    """Huggingface-specific BERT model settings.

    Attributes:
        model_name (str): name of the BERT model, e.g. ``bert-base-uncased``.
        cache_dir (str): tokenizer cache directory; created if it doesn't
            exist. NOTE(review): concurrent processes sharing this directory
            may race on reads/writes -- confirm before relying on it.
        use_fast_tokenizer (bool): whether to use the fast tokenizer.
        mask_token (str): special token used for masking.
        task (str): the task to use for BERT, e.g. ``mlm``.
        past_index (int): index of the past state in the model's state dict.
        prediction_loss_only (bool): when False, also produce metrics for
            predictions and labels.
        process_line_by_line (bool): when True, process the input
            line-by-line.
    """
    model_name: str = None
    cache_dir: str = None
    use_fast_tokenizer: bool = False
    mask_token: str = ''
    task: str = 'mlm'
    past_index: int | None = -2
    prediction_loss_only: bool = False
    process_line_by_line: bool = False

    @staticmethod
    def from_dict(config) -> BERTModelConfig:
        """Build a BERTModelConfig from a plain dict."""
        return from_dict(BERTModelConfig, config)
""" seed: int | None = None label_smoothing_factor: float | None = None batch_size: int | None = None max_seq_length: int | None = None @staticmethod def from_dict(config) -> BERTTrainingConfig: return from_dict(BERTTrainingConfig, config) @dataclass class BERTSpecificConfig(Config): """BERT configuration Specifies the model and training configuration for huggingface modeling scenarios. Attributes: loader_type (str): loader type hint. eg 'text' model (BERTModelConfig): BERT model configuration. training (BERTTrainingConfig): BERT training configuration. """ loader_type: str = None model: BERTModelConfig = None training: BERTTrainingConfig = None @staticmethod def from_dict(config) -> BERTSpecificConfig: result = BERTSpecificConfig() for k in config: if k == 'model': result.model = BERTModelConfig.from_dict(config[k]) elif k == 'training': result.training = BERTTrainingConfig.from_dict(config[k]) else: setattr(result, k, config[k]) return result @dataclass class BERTConfig(ModelConfig): """ Expected MLM config wraps the BERTSpecificConfig as a sub-field of the ModelConfig. """ BERT: BERTSpecificConfig = None @staticmethod def from_dict(config) -> ModelConfig: result = BERTConfig() for k in config: if k=="BERT": result.BERT = BERTConfig.from_dict(config[k]) else: setattr(result, k, config[k]) return result ================================================ FILE: experiments/mlm_bert/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class DataLoader(BaseDataLoader):
    """
    PyTorch dataloader for loading text data from text_dataset.
    """
    def __init__(self, mode, data, num_workers=0, **kwargs):
        """Build a dataloader over one user's (or the whole test set's) utterances.

        Args:
            mode (str): 'train', 'val' or 'test'; selects sampler and data slice.
            data: path to a FLUTE json file, or the already-loaded structure.
            num_workers (int): passed through to torch's DataLoader.
            **kwargs: must contain 'args' (experiment config dict) and
                'user_idx' (which client's data to load; -1 defers loading).

        Raises:
            ValueError: no tokenizer name/path in the config.
            Exception: unknown `mode`.
        """
        args = kwargs['args']
        task = args['task']
        user_idx = kwargs['user_idx']
        mlm_probability = args['mlm_probability']
        self.batch_size = args['batch_size']
        self.mode = mode
        self.num_workers = num_workers
        self.utt_ids = None
        max_samples_per_user = args.get('max_samples_per_user', -1)
        min_words_per_utt = args.get('min_words_per_utt', 5)

        tokenizer_kwargs = {
            "cache_dir": args['cache_dir'],
            "use_fast": args['tokenizer_type_fast'],
            "use_auth_token": None
        }
        if 'tokenizer_name' in args:
            tokenizer = AutoTokenizer.from_pretrained(args['tokenizer_name'], **tokenizer_kwargs)
        elif 'model_name_or_path' in args:
            tokenizer = AutoTokenizer.from_pretrained(args['model_name_or_path'], **tokenizer_kwargs)
        else:
            raise ValueError("You are instantiating a new tokenizer from scratch. This is not supported by this script.")

        print_rank("Tokenizer is: {}".format(tokenizer), loglevel=logging.DEBUG)

        dataset = Dataset(
            data,
            args=args,
            # BUG FIX: was `self.mode is not 'train'` — an identity comparison
            # against a string literal (implementation-dependent, raises a
            # SyntaxWarning on modern Pythons). Equality is what is meant.
            test_only=self.mode != 'train',
            tokenizer=tokenizer,
            user_idx=user_idx,
            max_samples_per_user=max_samples_per_user,
            min_words_per_utt=min_words_per_utt,
        )
        self.utt_ids = dataset.user

        try:
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=task == 'mlm',
                mlm_probability=mlm_probability,
            )
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt. Keep the best-effort fallback but only for
        # ordinary exceptions.
        except Exception:
            print('There is an issue with the DataCollator .. Falling back to default_data_collator')
            data_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer)

        if self.mode == 'train':
            # Random order over the user's utterances for SGD
            train_sampler = RandomSampler(dataset)
            super(DataLoader, self).__init__(
                dataset,
                batch_size=self.batch_size,
                sampler=train_sampler,
                collate_fn=data_collator,
                drop_last=False,
                num_workers=self.num_workers,
                pin_memory=True,
            )
        elif self.mode == 'val' or self.mode == 'test':
            # Deterministic order for evaluation
            eval_sampler = SequentialSampler(dataset)
            super(DataLoader, self).__init__(
                dataset,
                sampler=eval_sampler,
                batch_size=self.batch_size,
                collate_fn=data_collator,
                drop_last=False,
                num_workers=self.num_workers,
                pin_memory=True,
            )
        else:
            raise Exception("Sorry, there is something wrong with the 'mode'-parameter ")

    def get_user(self):
        """Return the id of the user whose data this loader serves ('test_only' for eval)."""
        return self.utt_ids
class Dataset(BaseDataset):
    """
    Map a text source to the target text.

    Loads one user's utterances (or, for val/test, every user's) from the
    FLUTE json structure, filters them by a minimum word count, and either
    keeps raw text (process_line_by_line=True; tokenized lazily in
    __getitem__) or pre-tokenizes and re-chunks everything into fixed-length
    frames of max_seq_length tokens (post_process_list).
    """

    def __init__(self, data, args, tokenizer=None, test_only=False, user_idx=0, max_samples_per_user=-1, min_words_per_utt=5, **kwargs):
        # Holds raw-utterance dicts, later replaced by tokenized frames when
        # process_line_by_line is False.
        self.utt_list = list()
        self.test_only = test_only
        self.padding = args.get('padding', True)
        self.max_seq_length = args['max_seq_length']
        self.max_samples_per_user = max_samples_per_user
        self.min_num_words = min_words_per_utt
        self.process_line_by_line = args.get('process_line_by_line', False)
        self.user = None

        if tokenizer != None:
            self.tokenizer = tokenizer
        else:
            # No tokenizer supplied by the caller: build one from the config.
            tokenizer_kwargs = {
                "cache_dir": args['cache_dir'],
                "use_fast": args['tokenizer_type_fast'],
                "use_auth_token": None
            }
            if 'tokenizer_name' in args:
                self.tokenizer = AutoTokenizer.from_pretrained(args['tokenizer_name'], **tokenizer_kwargs)
            elif 'model_name_or_path' in args:
                self.tokenizer = AutoTokenizer.from_pretrained(args['model_name_or_path'], **tokenizer_kwargs)
            else:
                raise ValueError("You are instantiating a new tokenizer from scratch. This is not supported by this script.")

        # Clamp max_seq_length against the tokenizer's own model_max_length.
        if self.max_seq_length is None:
            self.max_seq_length = self.tokenizer.model_max_length
            if self.max_seq_length > 512:
                print_rank(
                    f"The tokenizer picked seems to have a very large `model_max_length` ({self.tokenizer.model_max_length}). "
                    "Picking 512 instead. You can change that default value by passing --max_seq_length xxx.",
                    loglevel=logging.DEBUG
                )
                self.max_seq_length = 512
        else:
            if self.max_seq_length > self.tokenizer.model_max_length:
                print_rank(
                    f"The max_seq_length passed ({self.max_seq_length}) is larger than the maximum length for the"
                    f"model ({self.tokenizer.model_max_length}). Using max_seq_length={self.tokenizer.model_max_length}.",
                    loglevel=logging.DEBUG
                )
            self.max_seq_length = min(self.max_seq_length, self.tokenizer.model_max_length)

        self.load_data(data, user_idx)
        if user_idx != -1:  # Avoid loading unnecessary data on memory before training
            if not self.process_line_by_line:
                # Eagerly tokenize and re-chunk the whole user's data.
                self.post_process_list()

    def __len__(self):
        # Number of utterances (raw) or frames (after post_process_list).
        return len(self.utt_list)

    def __getitem__(self, idx):
        # Find the index in the available data
        if self.process_line_by_line:
            # Lazy tokenization of a single utterance.
            # NOTE(review): 'src_text' is a single string here, and
            # LineByLineTextDataset iterates `input_lines` — for a plain str
            # that iterates characters. Confirm this is the intended input
            # shape for the line-by-line path.
            tokenized_text = LineByLineTextDataset(
                tokenizer=self.tokenizer,
                input_lines=self.utt_list[idx]['src_text'],
                line_by_line=True,
                truncation=True,
                max_length=self.max_seq_length,
                padding="max_length")
            # Record the tokenized length as the utterance 'duration'.
            self.utt_list[idx]['duration'] = len(tokenized_text['input_ids'])
            return tokenized_text
        else:
            return self.utt_list[idx]

    def load_data(self, orig_strct, user_idx):
        """ Reads the data for a specific user (unless it's for val/testing)
        and returns a list of embeddings and targets."""
        if isinstance(orig_strct, str):
            # A path was passed instead of the parsed structure.
            print('Loading json-file: ', orig_strct)
            with open(orig_strct, 'r') as fid:
                orig_strct = json.load(fid)

        # Standard FLUTE layout: parallel lists of users/num_samples plus a
        # user -> data mapping.
        self.user_list = orig_strct['users']
        self.num_samples = orig_strct['num_samples']
        self.user_data = orig_strct['user_data']

        if user_idx != -1:  # Avoid loading unnecessary data on memory before training
            if self.test_only:
                # Val/test: flatten all users into one dataset.
                self.user = 'test_only'
                self.process_x(self.user_data)
            else:
                self.user = self.user_list[user_idx]
                self.process_x(self.user_data[self.user])

    def process_x(self, raw_x_batch):
        """Filter/truncate the raw utterances into self.utt_list and keep
        num_samples in sync with what was actually retained."""
        if self.test_only:
            for i, user in enumerate(self.user_list):
                counter = self.process_user(user, raw_x_batch[user])
                self.num_samples[i] = counter  # Update userdata counter "num_samples[user]" after truncation
        else:
            counter = self.process_user(self.user, raw_x_batch)
            self.num_samples[self.user_list.index(self.user)] = counter  # Update userdata counter "num_samples[user]" after truncation

        # Placeholder so the dataset is never empty (DataLoader needs len > 0).
        if len(self.utt_list) == 0:
            self.utt_list = [{'src_text': 'N/A', 'duration': 0, 'loss_weight': 1.0}]
        print_rank('Processing json-structure for User: {} Utterances Processed: {}'.format(self.user, len(self.utt_list)), loglevel=logging.DEBUG)

    def process_user(self, user, user_data):
        """Append one user's utterances to self.utt_list.

        Skips utterances shorter than min_num_words and stops once
        max_samples_per_user (if >= 0) is reached. Returns the number of
        utterances kept for this user.
        """
        counter = 0
        for line in user_data:
            for e in line:
                if len(e.split()) < self.min_num_words:
                    continue

                if self.max_samples_per_user > -1 and counter >= self.max_samples_per_user:
                    print_rank('Max allowed size per user is reached for user: {}, N: {} utts, Utt_list Len: {}' \
                            .format(user, counter, len(self.utt_list)), loglevel=logging.DEBUG)
                    return counter
                counter += 1

                utt = {}
                utt['src_text'] = e
                # 'duration' here is the word count; it is overwritten with the
                # token count when tokenized lazily in __getitem__.
                utt['duration'] = len(e.split())
                utt['loss_weight'] = 1.0
                self.utt_list.append(utt)
        return counter

    def post_process_list(self):
        """Tokenize all retained text and re-chunk it into frames of exactly
        max_seq_length tokens, replacing self.utt_list in place."""
        # Use only the text part of the dataset
        input_lines = [line['src_text'] for line in self.utt_list]

        # Process all lines of text
        print_rank('Tokenizing {} Utterances'.format(len(input_lines)), loglevel=logging.DEBUG)
        self.utt_list = LineByLineTextDataset(self.tokenizer, input_lines)  # this one has return_special_tokens_mask as True

        def group_texts(examples):
            """"Main data processing function that will concatenate all texts
            from our dataset and generate chunks of max_seq_length."""
            print_rank('Concatenating Frames in Sequences of {} samples'.format(self.max_seq_length), loglevel=logging.DEBUG)
            if self.padding:
                # Padding last frame so the total token count is an exact
                # multiple of max_seq_length.
                total_length = sum([len(k) for k in examples['input_ids']])
                print_rank('Found {} samples Before Concatenation'.format(total_length), loglevel=logging.DEBUG)
                padN = self.max_seq_length - (total_length % self.max_seq_length)
                print_rank('Padding last frame with {} samples'.format(padN), loglevel=logging.DEBUG)
                print_rank('keys {}'.format(examples.keys()), loglevel=logging.DEBUG)
                # Pad tokens get attention_mask 0 / special_tokens_mask 1 so
                # they are ignored by the model and by masking.
                examples['input_ids'].append([self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)]*padN)
                examples['attention_mask'].append([0]*padN)
                if 'special_tokens_mask' in examples.keys():
                    examples['special_tokens_mask'].append([1]*padN)
                if 'token_type_ids' in examples.keys():
                    examples['token_type_ids'].append([0]*padN)

            # Concatenate all input.
            concatenated_examples = {k: list(itertools.chain.from_iterable(examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            print_rank('Concatenated in {} Samples'.format(total_length), loglevel=logging.DEBUG)
            total_length = (total_length // self.max_seq_length) * self.max_seq_length
            print_rank('Concatenated in {} Frames'.format(total_length // self.max_seq_length), loglevel=logging.DEBUG)

            # Split by chunks of max_len; this rebinds self.utt_list to the
            # list of fixed-length frames.
            self.utt_list = []
            for i in range(0, total_length, self.max_seq_length):
                utt = {}
                for k, t in concatenated_examples.items():
                    utt[k] = t[i : i + self.max_seq_length]
                self.utt_list.append(utt)
                print_rank('Utterance Len is: {}'.format(len(utt['input_ids'])), loglevel=logging.DEBUG)

        # Process list of text
        group_texts(self.utt_list)
        total_length = len(self.utt_list)
        print_rank('Finished Reshaping in Sequences of {} Frames'.format(total_length), loglevel=logging.INFO)

        # Update userdata after truncation
        if not self.test_only:
            self.num_samples[self.user_list.index(self.user)] = total_length

        # Not used anywhere but necessary when the dataset is initiated
        if total_length == 0:
            self.utt_list = [{"input_ids": [0, 2], "special_tokens_mask": [1, 1], "attention_mask": [0, 0]}]


def LineByLineTextDataset(tokenizer, input_lines, truncation=True, max_length=512, padding = False, line_by_line=False):
    """Tokenize a batch of text lines with special-token masks.

    Empty / whitespace-only lines are dropped; the sentinel ['N/A'] (empty
    user) yields a minimal dummy encoding. When line_by_line is True, the
    first encoding is unwrapped so a single example is returned.
    """
    if input_lines == ['N/A']:
        # Dummy encoding for users with no usable data.
        batch_encoding = {"input_ids": [[0, 2]], "special_tokens_mask": [[1, 1]], "attention_mask": [[0, 0]]}
    else:
        lines = [line for line in input_lines if (len(line) > 0 and not line.isspace())]
        print_rank ('padding is : ' + str(padding), loglevel=logging.DEBUG)
        print_rank ('max_length is : ' + str(max_length), loglevel=logging.DEBUG)
        batch_encoding = tokenizer(lines, truncation=truncation, max_length=max_length, padding = padding, return_special_tokens_mask=True,)
    if line_by_line:
        # Unwrap the batch dimension: keep only the first encoded line.
        batch_encoding["input_ids"] = batch_encoding["input_ids"][0]
        batch_encoding["special_tokens_mask"] = batch_encoding["special_tokens_mask"][0]
        batch_encoding["attention_mask"] = batch_encoding["attention_mask"][0]
    return batch_encoding
class BERT(BaseModel):
    """FLUTE wrapper around a huggingface masked-LM (AutoModelForMaskedLM).

    Builds model + tokenizer from the 'BERT' section of the model config and
    exposes the forward / loss / inference hooks expected by core.model.BaseModel.
    """

    def __init__(self, model_config, **kwargs):
        """Instantiate tokenizer and masked-LM model.

        Args:
            model_config (dict): must contain a 'BERT' entry with 'model'
                (huggingface settings) and 'training' (seed, batch_size,
                label_smoothing_factor) sub-dicts.

        Raises:
            ValueError: when no config/tokenizer/model name or path is given
                (training from scratch is not supported here).
        """
        super(BERT, self).__init__()

        # Extract the BERT-specific section and its two sub-sections.
        args = model_config['BERT']
        model_args, training_args = args['model'], args['training']

        # Set seed before initializing model.
        set_seed(training_args['seed'])

        self.gradient_accumulation_steps = model_args.get('gradient_accumulation_steps', 1)
        self.past_index = model_args.get('past_index', -1)
        self.prediction_loss_only = model_args.get('prediction_loss_only', True)
        self.eval_accumulation_steps = model_args.get('eval_accumulation_steps', None)
        self.label_names = model_args.get('label_names', None)
        self.batch_size = training_args['batch_size']
        self.model_name = model_args['model_name']
        # BUG FIX: _prepare_inputs reads self._past whenever past_index >= 0;
        # without this initialization the very first call raised AttributeError.
        self._past = None

        if 'model_name_or_path' not in model_args:
            model_args['model_name_or_path'] = self.model_name

        # Label smoothing (used in compute_loss when the factor is non-zero).
        if training_args['label_smoothing_factor'] != 0:
            self.label_smoother = LabelSmoother(epsilon=training_args['label_smoothing_factor'])
        else:
            self.label_smoother = None

        self.label_names = (["labels"]) if self.label_names is None else self.label_names

        config_kwargs = {
            "cache_dir": model_args['cache_dir'],
            "revision": None,
            "use_auth_token": None,
        }
        if 'config_name' in model_args:
            config = AutoConfig.from_pretrained(model_args['config_name'], **config_kwargs)
        elif 'model_name_or_path' in model_args:
            config = AutoConfig.from_pretrained(model_args['model_name_or_path'], **config_kwargs)
        else:
            raise ValueError(
                "You are instantiating a new configuration from scratch. This is not supported by this script."
            )

        tokenizer_kwargs = {
            "cache_dir": model_args['cache_dir'],
            "use_fast": model_args['use_fast_tokenizer'],
            "use_auth_token": None,
        }
        if 'tokenizer_name' in model_args:
            tokenizer = AutoTokenizer.from_pretrained(model_args['tokenizer_name'], **tokenizer_kwargs)
        elif 'model_name_or_path' in model_args:
            print('Loading Tokenizer from Pretrained: {}'.format(model_args['model_name_or_path']))
            tokenizer = AutoTokenizer.from_pretrained(model_args['model_name_or_path'], **tokenizer_kwargs)
        else:
            raise ValueError(
                "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            )

        self.output_layer_size = len(tokenizer)

        if 'model_name_or_path' in model_args:
            print('Loading Model from Pretrained: {}'.format(model_args['model_name_or_path']))
            self.model = AutoModelForMaskedLM.from_pretrained(
                model_args['model_name_or_path'],
                from_tf=False,
                config=config,
                cache_dir=model_args['cache_dir'],
                use_auth_token=None,
            )
            if 'adapter' in model_args:
                if model_args['adapter']:
                    self.model.add_adapter("FLUTE")
                    # Activate the adapter: freezes the backbone, trains only
                    # the adapter weights.
                    self.model.train_adapter("FLUTE")
        else:
            raise ValueError(
                "You are instantiating a new model from scratch. This is not supported by this script."
            )

        # Make the embedding matrix match the (possibly extended) tokenizer vocab.
        self.model.resize_token_embeddings(self.output_layer_size)

        total_params = 0
        trainable_params = 0
        for p in self.model.parameters():
            total_params += p.numel()
            if p.requires_grad:
                trainable_params += p.numel()
        print_rank(f"Total parameters count: {total_params}", loglevel=logging.DEBUG)         # ~109M
        print_rank(f"Trainable parameters count: {trainable_params}", loglevel=logging.DEBUG) # ~1M
        print_rank(f"Original Bert parameters count: {total_params-trainable_params}", loglevel=logging.DEBUG)

    def copy_state_dict(self, state_dict):
        """Load a copy of `state_dict` into the wrapped model.

        BUG FIX: the original did `self.model.state_dict = state_dict.clone()`,
        which (a) shadows the `state_dict()` *method* with a data attribute and
        (b) fails outright since an OrderedDict has no `.clone()`.
        `load_state_dict` is the supported way to copy parameters in.
        """
        self.model.load_state_dict(state_dict)

    def get_model(self):
        """Return the underlying huggingface model."""
        return self.model

    def _prepare_inputs(self, inputs):
        """
        Prepare :obj:`inputs` before feeding them to the model, converting them
        to tensors if they are not already and handling potential state.
        """
        for k, v in inputs.items():
            if isinstance(v, T.Tensor):
                inputs[k] = to_device(v)
        if self.past_index >= 0 and self._past is not None:
            # Re-inject the cached past state for models that support 'mems'.
            inputs["mems"] = self._past
        return inputs

    def forward(self, inputs):
        """Run a forward pass on a (device-prepared) batch dict."""
        inputs = self._prepare_inputs(inputs)
        return self.model(**inputs)

    def loss(self, inputs):
        """Perform a training step on a batch of inputs.

        Args:
            inputs (dict): batch produced by the data collator; targets are
                expected under 'labels'.

        Returns:
            T.Tensor: the training loss, scaled by gradient_accumulation_steps.
        """
        inputs = self._prepare_inputs(inputs)
        loss = self.compute_loss(inputs)
        loss = loss / self.gradient_accumulation_steps
        return loss

    def compute_loss(self, inputs_orig, return_outputs=False):
        """Compute the (optionally label-smoothed) MLM loss for a batch.

        Args:
            inputs_orig (dict): batch dict; targets under 'labels'.
            return_outputs (bool): also return the raw model outputs.

        Returns:
            loss tensor, or (loss, outputs) when return_outputs is True.
        """
        # Work on a local copy so popped keys don't affect the caller's batch.
        inputs = copy.deepcopy(inputs_orig)

        if self.label_smoother is not None and "labels" in inputs:
            # NOTE(review): labels are moved to CPU here while the logits may
            # live on GPU; confirm LabelSmoother behaves on GPU runs.
            labels = inputs["labels"].detach().cpu()
        else:
            labels = None

        # The following fields need to be removed for Roberta.
        if 'roberta' in self.model_name:
            if 'attention_mask' in inputs:
                inputs.pop('attention_mask')
            if 'special_tokens_mask' in inputs:
                inputs.pop('special_tokens_mask')

        # Forward pass for the transformer.
        outputs = self.model(**inputs)

        if self.past_index >= 0:
            # Cache past state for the next step.
            self._past = outputs[self.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        return (loss, outputs) if return_outputs else loss

    def inference(self, inputs, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval") -> Dict:
        """Run prediction on one batch and return loss/accuracy metrics.

        Args:
            inputs (dict): batch dict; targets under 'labels'.
            ignore_keys (list, optional): output keys to skip when gathering
                predictions.
            metric_key_prefix (str): prefix for metric names ('eval' default).

        Returns:
            dict with 'output' (eval loss), 'acc' and 'batch_size'.
        """
        # BUG FIX: the default used to be a mutable `[]`; use the None
        # sentinel (backward-compatible — omitted argument behaves the same).
        ignore_keys = [] if ignore_keys is None else ignore_keys
        output, batch_size = self.prediction_loop(
            inputs,
            description="Evaluation",
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix)
        return {'output': output['eval_loss'], 'acc': output['eval_acc'], 'batch_size': batch_size[0]}

    def prediction_loop(
        self, inputs, description: str, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval",
    ) -> Union[Dict, List[int]]:
        """
        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and
        :obj:`Trainer.predict()`. Works both with or without labels.
        """
        out_label_ids = None
        if 'labels' in inputs:
            out_label_ids = inputs['labels'].detach().cpu()
        # BUG FIX: attention_mask was left unbound when the batch had no
        # 'attention_mask' key but was still passed to ComputeMetrics below.
        attention_mask = None
        if 'attention_mask' in inputs:
            attention_mask = inputs['attention_mask'].detach().cpu()

        losses_host = None
        preds_host = None
        labels_host = None
        # Single-host, single-process evaluation in FLUTE.
        world_size = 1
        num_hosts = 1

        eval_losses_gatherer = DistributedTensorGatherer(world_size, num_hosts, make_multiple_of=self.batch_size)
        if not self.prediction_loss_only:
            preds_gatherer = DistributedTensorGatherer(world_size, num_hosts)
            labels_gatherer = DistributedTensorGatherer(world_size, num_hosts)

        self.model.eval()

        if self.past_index >= 0:
            self._past = None

        loss, logits, _ = self.prediction_step(inputs, ignore_keys=ignore_keys, has_labels=True)
        if loss is not None:
            # One loss value per sample so the gatherer can weight correctly.
            losses = loss.repeat(self.batch_size).cpu()
            losses_host = losses if losses_host is None else T.cat((losses_host, losses), dim=0)
        if logits is not None:
            preds_host = logits.detach().cpu() if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
        if out_label_ids is not None:
            labels_host = out_label_ids if labels_host is None else nested_concat(labels_host, out_label_ids, padding_index=-100)

        # Gather all tensors and put them back on the CPU if we have done
        # enough accumulation steps.
        if self.eval_accumulation_steps is not None:
            eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
            if not self.prediction_loss_only:
                preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
            # Set back to None to begin a new accumulation.
            losses_host, preds_host, labels_host = None, None, None

        if self.past_index and hasattr(self, "_past"):
            # Clean the state at the end of the evaluation loop.
            # BUG FIX: was `delattr(self, "_past")`, which made any later
            # access in _prepare_inputs raise AttributeError; reset instead.
            self._past = None

        # Gather all remaining tensors and put them back on the CPU.
        if num_hosts > 1:
            # BUG FIX: removed `want_masked=True` — DistributedTensorGatherer.
            # add_arrays does not accept that keyword and the call raised
            # TypeError.
            eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
            if not self.prediction_loss_only:
                preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
            eval_loss = eval_losses_gatherer.finalize()
            preds = preds_gatherer.finalize() if not self.prediction_loss_only else None
            label_ids = labels_gatherer.finalize() if not self.prediction_loss_only else None
        else:
            eval_loss = losses_host
            preds = preds_host
            label_ids = labels_host

        if preds is not None and label_ids is not None:
            metrics = ComputeMetrics.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids), attention_mask)
        else:
            metrics = {}

        if eval_loss is not None:
            metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

        # Prefix all keys with metric_key_prefix + '_'
        # NOTE(review): this assumes un-prefixed metric values are tensors
        # (have .item()); confirm against ComputeMetrics' return types.
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key).item()

        # NOTE(review): `preds.size()` requires preds to be a tensor, i.e.
        # prediction_loss_only must be False for inference() to work.
        return metrics, preds.size()

    def _gather_and_numpify(self, tensors, name):
        """
        Gather value of `tensors` (tensor or list/tuple of nested tensors) and
        convert them to numpy before concatenating them to `gathered`.
        """
        if tensors is None:
            return
        return nested_numpify(tensors)

    def prediction_step(
        self, inputs, ignore_keys: Optional[List[str]] = None, has_labels: bool = None
    ) -> Tuple[Optional[float], Optional[T.Tensor], Optional[T.Tensor]]:
        """Perform an evaluation step on :obj:`model` using obj:`inputs`.

        Args:
            inputs (dict): batch dict; targets under 'labels'.
            ignore_keys (list, optional): output keys to skip when gathering
                predictions.
            has_labels (bool): whether 'labels' is present in the batch.

        Return:
            Tuple[Optional[float], Optional[T.Tensor], Optional[T.Tensor]]:
            (loss, logits, labels), each possibly None.
        """
        # Robustness: ignore_keys is iterated below, so normalize None to [].
        ignore_keys = [] if ignore_keys is None else ignore_keys
        inputs = self._prepare_inputs(inputs)

        # labels may be popped when computing the loss (label smoothing for
        # instance) so we grab them first.
        if has_labels:
            labels = inputs["labels"].detach().cpu()
            if len(labels) == 1:
                labels = labels[0]
        else:
            labels = None

        with T.no_grad():
            if has_labels:
                loss, outputs = self.compute_loss(inputs, return_outputs=True)
                loss = loss.mean().detach()
                if isinstance(outputs, dict):
                    logits = outputs["logits"]
                else:
                    logits = outputs[1:]
            else:
                loss = None
                outputs = self.model(**inputs)
                if isinstance(outputs, dict):
                    logits = tuple(v for k, v in outputs.items() if k not in ignore_keys)
                else:
                    logits = outputs
                if self.past_index >= 0:
                    self._past = outputs[self.past_index - 1]

        if self.prediction_loss_only:
            return (loss, None, None)

        logits = nested_detach(logits)
        if len(logits) == 1:
            logits = logits[0]
        return (loss, logits, labels)

    def floating_point_ops(self, inputs):
        """
        For models that inherit from :class:`~transformers.PreTrainedModel`,
        uses that method to compute the number of floating point operations for
        every backward + forward pass. Returns 0 for models without it.
        """
        if hasattr(self.model, "floating_point_ops"):
            return self.model.floating_point_ops(inputs)
        else:
            return 0

    def set_eval(self):
        """ Bring the model into evaluation mode """
        self.model.eval()

    def set_train(self):
        """ Bring the model into train mode """
        self.model.train()
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Torch utilities for the Trainer class. """ import json import math import os import warnings from contextlib import contextmanager from dataclasses import dataclass from typing import Dict, Iterator, List, Optional, Union import numpy as np import torch from packaging import version from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler # this is used to supress an undesired warning emitted by pytorch versions 1.4.2-1.7.0 try: from torch.optim.lr_scheduler import SAVE_STATE_WARNING except ImportError: SAVE_STATE_WARNING = "" def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100): """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: return torch.cat((tensor1, tensor2), dim=0) # Let's figure out the new shape new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:] # Now let's fill the result tensor result = tensor1.new_full(new_shape, padding_index) result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1 result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2 return result def numpy_pad_and_concatenate(array1, array2, padding_index=-100): """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary.""" if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]: return np.concatenate((array1, array2), dim=0) # Let's figure out the 
new shape new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:] # Now let's fill the result tensor result = np.full_like(array1, padding_index, shape=new_shape) result[: array1.shape[0], : array1.shape[1]] = array1 result[array1.shape[0] :, : array2.shape[1]] = array2 return result def nested_concat(tensors, new_tensors, padding_index=-100): """ Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or nested list/tuples of tensors. """ assert type(tensors) == type( new_tensors ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}." if isinstance(tensors, (list, tuple)): return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors)) elif isinstance(tensors, torch.Tensor): return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) elif isinstance(tensors, np.ndarray): return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) else: raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}") def nested_numpify(tensors): "Numpify `tensors` (even if it's a nested list/tuple of tensors)." if isinstance(tensors, (list, tuple)): return type(tensors)(nested_numpify(t) for t in tensors) return tensors.cpu().numpy() def nested_detach(tensors): "Detach `tensors` (even if it's a nested list/tuple of tensors)." 
if isinstance(tensors, (list, tuple)): return type(tensors)(nested_detach(t) for t in tensors) return tensors.detach() def reissue_pt_warnings(caught_warnings): # Reissue warnings that are not the SAVE_STATE_WARNING if len(caught_warnings) > 1: for w in caught_warnings: if w.category != UserWarning or w.message != SAVE_STATE_WARNING: warnings.warn(w.message, w.category) def nested_new_like(arrays, num_samples, padding_index=-100): """ Create the same nested structure as `arrays` with a first dimension always at `num_samples`.""" if isinstance(arrays, (list, tuple)): return type(arrays)(nested_new_like(x, num_samples) for x in arrays) return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:])) def nested_expand_like(arrays, new_seq_length, padding_index=-100): """ Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding.""" if isinstance(arrays, (list, tuple)): return type(arrays)(nested_expand_like(x, new_seq_length, padding_index=padding_index) for x in arrays) result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:]) result[:, : arrays.shape[1]] = arrays return result def nested_truncate(tensors, limit): "Truncate `tensors` at `limit` (even if it's a nested list/tuple of tensors)." if isinstance(tensors, (list, tuple)): return type(tensors)(nested_truncate(t, limit) for t in tensors) return tensors[:limit] def _get_first_shape(arrays): """Return the shape of the first array found in the nested struct `arrays`.""" if isinstance(arrays, (list, tuple)): return _get_first_shape(arrays[0]) return arrays.shape class DistributedTensorGatherer: """ A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks. 
class DistributedTensorGatherer:
    """
    A class responsible for properly gathering tensors (or nested list/tuple of
    tensors) on the CPU by chunks.

    Each process is assumed to produce a contiguous slice of the (padded)
    dataset; `add_arrays` interleaves those slices back into their original
    order and `finalize` truncates the extra samples the sampler added to make
    every process's share the same length.

    Args:
        world_size (:obj:`int`): the number of processes used in the
            distributed training.
        num_samples (:obj:`int`): the number of samples in our dataset.
        make_multiple_of (:obj:`int`, `optional`): if passed, the class assumes
            the datasets passed to each process are made to be a multiple of
            this argument (by adding samples).
        padding_index (:obj:`int`, `optional`, defaults to -100): the padding
            index to use if the arrays don't all have the same sequence length.
    """

    def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100):
        self.world_size = world_size
        self.num_samples = num_samples
        # Round num_samples up so it divides evenly across processes (and the
        # optional make_multiple_of batch granularity).
        total_size = world_size if make_multiple_of is None else world_size * make_multiple_of
        self.total_samples = int(np.ceil(num_samples / total_size)) * total_size
        self.process_length = self.total_samples // world_size
        self._storage = None
        self._offsets = None
        self.padding_index = padding_index

    def add_arrays(self, arrays):
        """
        Add :obj:`arrays` to the internal storage. Will initialize the storage
        to the full size at the first arrays passed so that if we're bound to
        get an OOM, it happens at the beginning.

        NOTE(review): model.py's prediction_loop used to call this with a
        `want_masked=True` keyword that this signature does not accept —
        reconcile with that call site.
        """
        if arrays is None:
            return
        if self._storage is None:
            self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index)
            # One write offset per process slice.
            self._offsets = list(range(0, self.total_samples, self.process_length))
        else:
            storage_shape = _get_first_shape(self._storage)
            arrays_shape = _get_first_shape(arrays)
            if len(storage_shape) > 1 and storage_shape[1] < arrays_shape[1]:
                # If we get new arrays that are too big too fit, we expand the
                # shape of the storage.
                self._storage = nested_expand_like(self._storage, arrays_shape[1], padding_index=self.padding_index)
        slice_len = self._nested_set_tensors(self._storage, arrays)
        for i in range(self.world_size):
            self._offsets[i] += slice_len

    def _nested_set_tensors(self, storage, arrays):
        """Scatter each process's slice of `arrays` into `storage` at the
        current offsets; returns the per-process slice length."""
        if isinstance(arrays, (list, tuple)):
            for x, y in zip(storage, arrays):
                slice_len = self._nested_set_tensors(x, y)
            return slice_len
        assert (
            arrays.shape[0] % self.world_size == 0
        ), f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}."

        slice_len = arrays.shape[0] // self.world_size
        for i in range(self.world_size):
            if len(arrays.shape) == 1:
                storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len]
            else:
                # Only the columns actually present are written; the rest stays
                # at padding_index.
                storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[
                    i * slice_len : (i + 1) * slice_len
                ]
        return slice_len

    def finalize(self):
        """
        Return the properly gathered arrays and truncate to the number of
        samples (since the sampler added some extras to get each process a
        dataset of the same length). Returns None if nothing was added.
        """
        if self._storage is None:
            return
        if self._offsets[0] != self.process_length:
            # BUG FIX: this used `logger.warn(...)` but no `logger` is defined
            # anywhere in this module, so hitting this path raised NameError.
            warnings.warn("Not all data has been set. Are you sure you passed all values?")
        return nested_truncate(self._storage, self.num_samples)
not-padded): num_active_elements = padding_mask.numel() - padding_mask.long().sum() nll_loss = nll_loss.sum() / num_active_elements smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1]) return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None): """ Return a list of indices so that each slice of :obj:`batch_size` consecutive indices correspond to elements of similar lengths. To do this, the indices are: - randomly permuted - grouped in mega-batches of size :obj:`mega_batch_mult * batch_size` - sorted by length in each mega-batch The result is the concatenation of all mega-batches, with the batch of :obj:`batch_size` containing the element of maximum length placed first, so that an OOM happens sooner rather than later. """ # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller. if mega_batch_mult is None: mega_batch_mult = min(len(lengths) // (batch_size * 4), 50) # Just in case, for tiny datasets if mega_batch_mult == 0: mega_batch_mult = 1 # We need to use torch for the random part as a distributed sampler will set the random seed for torch. indices = torch.randperm(len(lengths), generator=generator) megabatch_size = mega_batch_mult * batch_size megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] megabatches = [list(sorted(megabatch, key=lambda i: lengths[i], reverse=True)) for megabatch in megabatches] # The rest is to get the biggest batch first. 
# Since each megabatch is sorted by descending length, the longest element is the first megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches] max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item() # Switch to put the longest element in first position megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0] return sum(megabatches, []) class LengthGroupedSampler(Sampler): r""" Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ def __init__(self, dataset: Dataset, batch_size: int, lengths: Optional[List[int]] = None): self.dataset = dataset self.batch_size = batch_size if lengths is None: if not isinstance(dataset[0], dict) or "input_ids" not in dataset[0]: raise ValueError( "Can only automatically infer lengths for datasets whose items are dictionaries with an " "'input_ids' key." ) lengths = [len(feature["input_ids"]) for feature in dataset] self.lengths = lengths def __len__(self): return len(self.lengths) def __iter__(self): indices = get_length_grouped_indices(self.lengths, self.batch_size) return iter(indices) class DistributedLengthGroupedSampler(DistributedSampler): r""" Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ # Copied and adapted from PyTorch DistributedSampler. 
def __init__( self, dataset: Dataset, batch_size: int, num_replicas: Optional[int] = None, rank: Optional[int] = None, seed: int = 0, drop_last: bool = False, lengths: Optional[List[int]] = None, ): if num_replicas is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") num_replicas = dist.get_world_size() if rank is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") rank = dist.get_rank() self.dataset = dataset self.batch_size = batch_size self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.drop_last = drop_last # If the dataset length is evenly divisible by # of replicas, then there # is no need to drop any data, since the dataset will be split equally. if self.drop_last and len(self.dataset) % self.num_replicas != 0: # Split to nearest available length that is evenly divisible. # This is to ensure each rank receives the same amount of data when # using this Sampler. self.num_samples = math.ceil((len(self.dataset) - self.num_replicas) / self.num_replicas) else: self.num_samples = math.ceil(len(self.dataset) / self.num_replicas) self.total_size = self.num_samples * self.num_replicas self.seed = seed if lengths is None: if not isinstance(dataset[0], dict) or "input_ids" not in dataset[0]: raise ValueError( "Can only automatically infer lengths for datasets whose items are dictionaries with an " "'input_ids' key." ) lengths = [len(feature["input_ids"]) for feature in dataset] self.lengths = lengths def __iter__(self) -> Iterator: # Deterministically shuffle based on epoch and seed g = torch.Generator() g.manual_seed(self.seed + self.epoch) indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g) if not self.drop_last: # add extra samples to make it evenly divisible indices += indices[: (self.total_size - len(indices))] else: # remove tail of data to make it evenly divisible. 
indices = indices[: self.total_size] assert len(indices) == self.total_size # subsample indices = indices[self.rank : self.total_size : self.num_replicas] assert len(indices) == self.num_samples return iter(indices) # In order to keep `trainer.py` compact and easy to understand, place any secondary PT Trainer # helper methods here def _get_learning_rate(self): if self.deepspeed: # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may # not run for the first few dozen steps while loss scale is too large, and thus during # that time `get_last_lr` will fail if called during that warm up stage, so work around it: try: last_lr = self.lr_scheduler.get_last_lr()[0] except AssertionError as e: if "need to call step" in str(e): logger.warn("tried to get lr value before scheduler/optimizer started stepping, returning lr=0") last_lr = 0 else: raise else: last_lr = ( # backward compatibility for pytorch schedulers self.lr_scheduler.get_last_lr()[0] if version.parse(torch.__version__) >= version.parse("1.4") else self.lr_scheduler.get_lr()[0] ) return last_lr def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]: """ Reformat Trainer metrics values to a human-readable format Args: metrics (:obj:`Dict[str, float]`): The metrics returned from train/evaluate/predict Returns: metrics (:obj:`Dict[str, float]`): The reformatted metrics """ metrics_copy = metrics.copy() for k, v in metrics_copy.items(): if "_mem_" in k: metrics_copy[k] = f"{ v >> 20 }MB" elif k == "total_flos": metrics_copy[k] = f"{ int(v) >> 30 }GF" elif type(metrics_copy[k]) == float: metrics_copy[k] = round(v, 4) return metrics_copy def log_metrics(self, split, metrics): """ Log metrics in a specially formatted way Args: split (:obj:`str`): Mode/split name: one of ``train``, ``eval``, ``test`` metrics (:obj:`Dict[str, float]`): The metrics returned from train/evaluate/predictmetrics: metrics dict """ logger.info(f"***** {split} metrics *****") metrics_formatted = 
self.metrics_format(metrics) k_width = max(len(str(x)) for x in metrics_formatted.keys()) v_width = max(len(str(x)) for x in metrics_formatted.values()) for key in sorted(metrics_formatted.keys()): logger.info(f" {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}") def save_metrics(self, split, metrics): """ Save metrics into a json file for that split, e.g. ``train_results.json``. Args: split (:obj:`str`): Mode/split name: one of ``train``, ``eval``, ``test``, ``all`` metrics (:obj:`Dict[str, float]`): The metrics returned from train/evaluate/predict """ path = os.path.join(self.args.output_dir, f"{split}_results.json") with open(path, "w") as f: json.dump(metrics, f, indent=4, sort_keys=True) ================================================ FILE: experiments/mlm_bert/utils/trainer_utils.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. # coding=utf-8 # Copyright 2020-present the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities for the Trainer and TFTrainer class. Should be independent from PyTorch and TensorFlow. """ import random from typing import Any, Dict, NamedTuple, Optional, Tuple, Union import numpy as np import torch import logging from utils import print_rank def set_seed(seed: int): """ Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if installed). Args: seed (:obj:`int`): The seed to set. 
""" random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) # ^^ safe to call this function even if cuda is not available class EvalPrediction(NamedTuple): """ Evaluation output (always contains labels), to be used to compute metrics. Parameters: predictions (:obj:`np.ndarray`): Predictions of the model. label_ids (:obj:`np.ndarray`): Targets to be matched. """ predictions: Union[np.ndarray, Tuple[np.ndarray]] label_ids: np.ndarray class PredictionOutput(NamedTuple): predictions: Union[np.ndarray, Tuple[np.ndarray]] label_ids: Optional[np.ndarray] metrics: Optional[Dict[str, float]] class ComputeMetrics: def __init__(self, p: EvalPrediction, mask=None): self.EvalPrediction = EvalPrediction self.compute_metrics( self.EvalPrediction) @staticmethod def compute_metrics(p: EvalPrediction, mask=None): print_rank('Prediction Block Size: {}'.format(p.predictions.size()), loglevel=logging.DEBUG) if len(list(p.predictions.size()))<3: if len(list(p.predictions.size()))<2: print_rank('There is something REALLY wrong with prediction tensor:'.format(p.predictions.size()), loglevel=logging.INFO) return {'acc': torch.tensor(0.0)} print_rank('There is something wrong with prediction tensor:'.format(p.predictions.size()), loglevel=logging.INFO) preds = np.argmax(p.predictions, axis=1) else: preds = np.argmax(p.predictions, axis=2) if mask is None: return {'acc': (preds == p.label_ids).float().mean()} else: #valid = preds >1 # reject oov predictions even if they're correct. valid = mask==1 return {'acc': (preds.eq(p.label_ids.cpu()) * valid.cpu()).float().mean()} ================================================ FILE: experiments/nlg_gru/README.md ================================================ # Simple example of a NLG task on Reddit Dataset Instructions on how to run the experiment, given below. 
## Preparing the data For this experiment, we can create a dummy dataset by running the script located in `testing/create_data.py` as follows: ```code python create_data.py --task nlg_gru ``` A couple of scripts are provided in `utils/preprocessing` for preprocessing .tsv files in case you want to use your own data. ## Creating a config file All the parameters of the experiment are passed in a YAML file. An basic example is provided in `configs/hello_world_nlg_gru_json.yaml` with the suggested parameters for local runs. The example provided above is for running json files. If you want to try with HDF5 files make sure to use the script `utils/preprocessing/from_json_to_hdf5.py` to convert the mock data to HDF5 format. ## Running the experiment Finally, to launch the experiment locally , it suffices to launch the `e2e_trainer.py` script using torch.distributed , you can use as example the following line: ```code python -m torch.distributed.run --nproc_per_node=3 e2e_trainer.py -dataPath .\testing\mockup\ -outputPath scratch -config .\testing\configs\hello_world_nlg_gru.yaml -task nlg_gru -backend nccl ``` For submitting jobs in Azure ML, we have included the instructions in the `Experiments` section of the main `README.md`. ================================================ FILE: experiments/nlg_gru/config.py ================================================ from __future__ import annotations from dataclasses import dataclass import sys sys.path.append('../../') from core.config import ModelConfig, from_dict @dataclass class GRUConfig(ModelConfig): """nlg_gru configuration The model configuration specifies model architecture, parameters, and initialization settings. Attributes: embed_dim (int): specific to GRU models, embedding dimension. vocab_size (int): specific to GRU models, the vocabulary size. hidden_dim (int): specific to GRU models, the hidden size. weight_init (str): ``default``, or ``xavier_normal``, indicating how to randomly initialize the model weights. 
OOV_correct (bool): whether OOV predictions are evaluated as correct, or ignored. """ embed_dim: int | None = None vocab_size: int | None = None hidden_dim: int | None = None weight_init: str = None OOV_correct: bool = False @staticmethod def from_dict(config) -> GRUConfig: return from_dict(GRUConfig, config) ================================================ FILE: experiments/nlg_gru/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import random import torch import numpy as np from core.dataloader import BaseDataLoader from torch.utils.data.distributed import DistributedSampler from experiments.nlg_gru.dataloaders.dataset import Dataset from utils.data_utils import BatchSampler, DynamicBatchSampler class DataLoader(BaseDataLoader): """ PyTorch dataloader for loading text data from text_dataset. """ def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] batch_sampler = None dataset = Dataset( data = kwargs['data'], test_only = not mode=="train", vocab_dict = args['vocab_dict'], user_idx = kwargs['user_idx'], max_num_words= args['max_num_words'], preencoded = args.get('preencoded', False)) if mode == 'train': sampler = DistributedSampler(dataset,num_replicas=1,rank=0) sampler.set_epoch(random.randint(0, 10**10)) batch_sampler = DynamicBatchSampler(sampler, frames_threshold = args['max_num_words'], max_batch_size = self.batch_size, unsorted_batch = args['unsorted_batch'], fps=1) elif mode == 'val' or mode == 'test': sampler = BatchSampler(dataset, batch_size=self.batch_size, randomize=False, drop_last=False) super().__init__(dataset, batch_sampler=sampler, num_workers=num_workers, collate_fn=self.collate_fn, pin_memory=args["pin_memory"]) return if batch_sampler is None: super().__init__(dataset, batch_size=self.batch_size, sampler=sampler, num_workers=num_workers, collate_fn=self.collate_fn, drop_last=True) else: 
super().__init__(dataset, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=self.collate_fn, pin_memory=args["pin_memory"]) def collate_fn(self, batch): def pad_and_concat_feats(labels): batch_size = len(labels) max_len = max(len(l[0]) for l in labels) cat_labels = np.full((batch_size, max_len), -1) for e, l in enumerate(labels): cat_labels[e,:len(l[0])] = np.squeeze(l) return cat_labels src_seq, utt_ids = zip(*batch) x_len = [len(s[0]) for s in src_seq] src_seq = pad_and_concat_feats(src_seq) packed = { 'x': torch.from_numpy(src_seq).long(), 'x_len': x_len, 'utt_ids' : utt_ids, 'total_frames' : sum(x_len), 'total_frames_with_padding' : np.prod(src_seq.shape), 'loss_weight' : None } return packed ================================================ FILE: experiments/nlg_gru/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import numpy as np import logging import json from utils import print_rank from core.dataset import BaseDataset from experiments.nlg_gru.utils.utility import * class Dataset(BaseDataset): """ Map a text source to the target text """ def __init__(self, data, min_num_words=2, max_num_words=25, test_only=False, user_idx=0, vocab_dict=None, preencoded=False, **kwargs): self.utt_list = list() self.test_only = test_only self.max_num_words = max_num_words self.min_num_words = min_num_words self.preencoded = preencoded # Load the vocab self.vocab = load_vocab(kwargs['args']['vocab_dict']) if 'args' in kwargs else load_vocab(vocab_dict) self.vocab_size = len(self.vocab) # reading the jsonl for a specific user_idx self.load_data(data, user_idx) def __len__(self): """Return the length of the elements in the list.""" return len(self.utt_list) def __getitem__(self, idx): """Find the index in the available data""" if self.preencoded: batch = np.array([self.utt_list[idx]['src_text']], dtype=np.int32) else: # case_backoff_batch tries to find the best 
capitalisation that will allow the word to be in vocabulary batch = case_backoff_batch([self.utt_list[idx]['src_text']], self.vocab.term_to_idx) batch = to_indices(self.vocab, batch) return batch, self.user def load_data(self, orig_strct, user_idx): if isinstance(orig_strct, str): print('Loading json-file: ', orig_strct) with open(orig_strct, 'r') as fid: orig_strct = json.load(fid) self.user_list = orig_strct['users'] self.num_samples = orig_strct['num_samples'] self.user_data = orig_strct['user_data'] self.user = 'test_only' if self.test_only else self.user_list[user_idx] if user_idx != -1: self.process_x(self.user_data) def process_x(self, user_data): print_rank('Processing data-structure: {} Utterances expected'.format(sum(self.num_samples)), loglevel=logging.DEBUG) for user in self.user_list: for e in user_data[user]['x']: utt={} utt['src_text'] = e if type(e) is list else e.split() utt['duration'] = len(e) if utt['duration']<= self.min_num_words: continue if utt['duration'] > self.max_num_words: utt['src_text'] = utt['src_text'][:self.max_num_words] utt['duration'] = self.max_num_words utt["loss_weight"] = 1.0 self.utt_list.append(utt) ================================================ FILE: experiments/nlg_gru/model.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import torch as T from torch import Tensor from typing import List, Tuple from core.model import BaseModel from utils import softmax, to_device class GRU2(T.nn.Module): def __init__(self, input_size, hidden_size, input_bias, hidden_bias): super(GRU2, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.w_ih = T.nn.Linear(input_size, 3 * hidden_size, input_bias) self.w_hh = T.nn.Linear(hidden_size, 3 * hidden_size, hidden_bias) def _forward_cell(self, input : Tensor, hidden : Tensor) -> Tensor: g_i = self.w_ih(input) g_h = self.w_hh(hidden) i_r, i_i, i_n = g_i.chunk(3, 1) h_r, h_i, h_n = g_h.chunk(3, 1) reset_gate = T.sigmoid(i_r + h_r) input_gate = T.sigmoid(i_i + h_i) new_gate = T.tanh(i_n + reset_gate * h_n) hy = new_gate + input_gate * (hidden - new_gate) return hy def forward(self, input : Tensor) -> Tuple[Tensor, Tensor]: hiddens : List[Tensor] = [to_device(T.zeros((input.shape[0], self.hidden_size)))] for step in range(input.shape[1]): hidden = self._forward_cell(input[:, step], hiddens[-1]) hiddens.append(hidden) return T.stack(hiddens, dim=1), hiddens[-1] class Embedding(T.nn.Module): def __init__(self, vocab_size, embedding_size): super(Embedding, self).__init__() self.vocab_size = vocab_size self.embedding_size = embedding_size self.table = T.nn.Parameter(T.zeros((vocab_size, embedding_size))) self.unembedding_bias = T.nn.Parameter(T.zeros(vocab_size)) delta = (3 / self.table.shape[1]) ** 0.5 T.nn.init.uniform_(self.table, -delta, delta) def forward(self, input : Tensor, embed : bool) -> Tensor: if embed: output = T.nn.functional.embedding(input, self.table) else: output = input @ self.table.t() + self.unembedding_bias return output class GRU(BaseModel): #DLM_2_0 def __init__(self, model_config, OOV_correct=False, dropout=0.0, topK_results=1, wantLogits=False, **kwargs): super(GRU, self).__init__() self.vocab_size = model_config['vocab_size'] self.embedding_size = model_config['embed_dim'] self.hidden_size = 
model_config['hidden_dim'] self.embedding = Embedding(self.vocab_size, self.embedding_size) self.rnn = GRU2(self.embedding_size, self.hidden_size, True, True) self.squeeze = T.nn.Linear(self.hidden_size, self.embedding_size, bias=False) self.OOV_correct = OOV_correct self.topK_results = topK_results self.dropout=dropout self.wantLogits=wantLogits if self.dropout>0.0: self.drop_layer = T.nn.Dropout(p=self.dropout) def forward(self, input : T.Tensor) -> Tuple[Tensor, Tensor]: input = input['x'] if isinstance(input, dict) else input input = to_device(input) embedding = self.embedding(input, True) hiddens, state = self.rnn(embedding) if self.dropout>0.0: hiddens= self.drop_layer(hiddens) output = self.embedding(self.squeeze(hiddens), False) return output, state def loss(self, input : T.Tensor) -> T.Tensor: input = input['x'] if isinstance(input, dict) else input input = to_device(input) non_pad_mask = input >= 0 input = input * non_pad_mask.long() non_pad_mask = non_pad_mask.view(-1) # Run the forward pass output, _ = self.forward(input[:, :-1]) # Estimate the targets targets = input.view(-1)[non_pad_mask] preds = output.view(-1, self.vocab_size)[non_pad_mask] # Estimate the loss return T.nn.functional.cross_entropy(preds, targets) def inference(self, input): input = input['x'] if isinstance(input, dict) else input input = to_device(input) non_pad_mask = input >= 0 input = input * non_pad_mask.long() non_pad_mask = non_pad_mask.view(-1) output, _ = self.forward(input[:, :-1]) # Apply mask to input/output targets = input.view(-1)[non_pad_mask] preds = output.view(-1, self.vocab_size)[non_pad_mask] # accuracy probs_topK, preds_topK = T.topk(preds, self.topK_results, sorted=True, dim=1) probs, preds = probs_topK[:,0], preds_topK[:,0] if self.OOV_correct: acc = preds.eq(targets).float().mean() else: valid = preds != 0 # reject oov predictions even if they're correct. 
acc = (preds.eq(targets) * valid).float().mean() if self.wantLogits: if 1: output= {'probabilities': softmax(probs_topK.cpu().detach().numpy(), axis=1), 'predictions': preds_topK.cpu().detach().numpy(), 'labels': targets.cpu().detach().numpy()} else: output = {'probabilities': probs_topK.cpu().detach().numpy(), 'predictions': preds_topK.cpu().detach().numpy(), 'labels': targets.cpu().detach().numpy()} return {'output':output, 'acc': acc.item(), 'batch_size': input.shape[0]} ================================================ FILE: experiments/nlg_gru/utils/utility.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import json import time from argparse import ArgumentParser import numpy as np from collections import namedtuple from tqdm import tqdm TR_UPPER = {ord('i'): 'İ'} TR_LOWER = {ord('I'): 'ı'} Vocab = namedtuple('Vocab', ['idx_to_term', 'term_to_idx']) def load_vocab(url): """Load a vocabulary file. url -- string -- url to the txt file returns -- Vocab(idx_to_term=list, term_to_idx=dict) """ term_to_idx = {} idx_to_term = [] with open(url, 'r', encoding='utf-8') as f: for i, line in enumerate(f): word = line.strip() idx_to_term.append(word) term_to_idx[word] = i return Vocab(idx_to_term, term_to_idx) def to_indices(vocab, batch, ndim=2, oov_idx=0, pad_idx=-1): """Convert a nested list of strings to a np.array of integers. vocab -- Vocab -- the vocabulary of the model batch -- [..[str]..] 
-- multidimensional batch ndim -- int -- number of dimensions in batch oov_idx -- int or None -- if specified, replace missing terms by the given index, otherwise raise an error pad_idx -- int or None -- if specified, pad short last-dimension as specified, otherwise raise an error raises -- ValueError -- if pad is required but pad_idx not specified -- KeyError -- if oov is required but oov_idx not specified returns -- np.array(int) -- term indices """ #print_rank(f'to_indices: batch len: {len(batch)} ndim: {ndim}') if ndim == 1: return np.array( [(vocab.term_to_idx[term] if oov_idx is None else vocab.term_to_idx.get(term, oov_idx)) for term in batch], dtype=np.int32) if ndim == 2: # note: in most circumstances there is only one example in the batch # as a result, padding is never applied. We rely on collate_fn to properly # apply padding. length = max(len(row) for row in batch) if pad_idx is None and min(len(row) for row in batch) != length: raise ValueError('Padding required, but no pad_idx provided') pad = length * [pad_idx] result = np.array( [[(vocab.term_to_idx[term] if oov_idx is None else vocab.term_to_idx.get(term, oov_idx)) for term in row] + pad[len(row):] for row in batch], dtype=np.int32) #print_rank(f'to_indices result: {result.shape}') return result # Flatten to a 2D batch, then recurse & reshape up (this ensures # padding is handled correctly) shape = [len(batch)] for _ in range(2, ndim): shape.append(len(batch[0])) batch = [item for sub_batch in batch for item in sub_batch] shape.append(-1) return to_indices(vocab, batch, ndim=2, oov_idx=oov_idx, pad_idx=pad_idx).reshape(*shape) def case_backoff_batch(batch, vocab): """Perform capitalization backoff on words both to lower & initial-upper case variants. 
batch -- list(list(string)) -- batch of sentences of words, to back off vocab -- set(string) -- vocabulary to consider returns -- list(list(string)) -- backed-off batch """ def _variants(word): yield word yield word.translate(TR_LOWER).lower() yield word.lower() if len(word) > 1: yield word[0].translate(TR_UPPER).capitalize() + word[1:] yield word.capitalize() return [[next((variant for variant in _variants(word) if variant in vocab), word) # will become OOV for word in sentence] for sentence in batch] def encode_data(data_dict, vocab): '''Encode data that is in the format expected by FLUTE Parameters ---------- data_dict: dict Dictionary where keys consist of usernames and values give the data for that user, specified by another dictionary with keys :code:`x` (features) and, optionally, :code:`y` (labels). vocab: Returns ------- dict Dictionary in the same format as the input one, but now the data in the :code:`x` field is given by tokens (i.e., integers), instead of strings. ''' new_dict = {} for key, value in tqdm(data_dict.items()): user_data = [s.split() for s in value['x']] processed_data = case_backoff_batch(user_data, vocab.term_to_idx) encoded_data = [[vocab.term_to_idx.get(term, 0) for term in row] for row in processed_data] new_dict[key] = {'x': encoded_data} return new_dict if __name__ == '__main__': parser = ArgumentParser(description='Encodes data') parser.add_argument('data_path', type=str, help='Path to data') parser.add_argument('vocab_path', type=str, help='Path to vocabulary') args = parser.parse_args() if not os.path.isfile(args.data_path): raise ValueError('data file does not exist') if not os.path.isfile(args.vocab_path): raise ValueError('vocabulary file does not exist') if args.data_path[-5:] != '.json': raise ValueError('argument must be a valid json file') # Load vocabulary print('Loading vocabulary...') vocab = load_vocab(args.vocab_path) # Load and encode data print('Loading data... 
', end='', flush=True) start_time = time.time() with open(args.data_path, 'r') as input_file: all_data = json.load(input_file) print(f'Finished in {time.time() - start_time:.2f}s') print('Converting data...') converted_user_data = encode_data(all_data['user_data'], vocab) # For debug purposes for k, v in converted_user_data.items(): print(f'USER: {k}\nDATA: {v}') break # Save encoded data to disk print('Saving encoded data to disk...') all_data['user_data'] = converted_user_data with open(f'{args.data_path[:-5]}-encoded.json', 'w') as output_file: json.dump(all_data, output_file) ================================================ FILE: experiments/nlp_rnn_fedshakespeare/README.md ================================================ ## FedML Benchmark ### Examples The example in this folder was taken from [FedML](https://github.com/FedML-AI/FedML/tree/master/python/examples/simulation/mpi_fedavg_datasets_and_models_example) repository on its release 0.7.300, using the configuration suggested on their [benchmarking results](https://doc.fedml.ai/simulation/benchmark/BENCHMARK_MPI.html) for MPI-Based Federated Learning (fastest on this version). ### Data FLUTE will automatically download the data used for this example, otherwise you can use the scripts provided [here](https://github.com/FedML-AI/FedML/tree/master/python/fedml/data) for each independent dataset in the FedML GitHub repository. ### Run If you downloaded the data manually, make sure that the variable `data_cache_dir` has been updated inside `preprocess.py`. Later, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=4 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/nlp_rnn_fedshakespeare/config.yaml -task nlp_rnn_fedshakespeare -backend nccl ``` ### Results This comparison was carried out using Parrot (Simulator) on version 0.7.303 at commit ID [8f7f261f](https://github.com/FedML-AI/FedML/tree/8f7f261f44e58d0cb5a416b0d6fa270b42a91049). 
``` _____________________________________________________________________________ | | FedML (MPI) - Fastest | FLUTE (NCCL) - Fastest | | Task | Acc | Time | GPU Mem | Acc | Time | GPU Mem | |--------------------|-----|----------|----------|-----|----------|-----------| | LR_MNIST | ~81 | 00:03:09 | ~3060 MB | ~81 | 00:01:35 | ~1060 MB | | CNN_FEMNIST | ~83 | 05:49:52 | ~5180 MB | ~83 | 00:08:22 | ~1770 MB | | RESNET_FEDCIFAR100 | ~34 | 15:55:36 | ~5530 MB | ~33 | 01:42:01 | ~1900 MB | | RNN_FEDSHAKESPEARE | ~57 | 06:46:21 | ~3690 MB | ~57 | 00:21:50 | ~1270 MB | ----------------------------------------------------------------------------- ``` ### FedML Configuration file In order to reproduce this experiment in FedML please use the setup below. ```yaml common_args: training_type: "simulation" random_seed: 0 data_args: dataset: "fed_shakespeare" data_cache_dir: ~/fedml_data partition_method: "hetero" partition_alpha: 0.5 model_args: model: "rnn" train_args: federated_optimizer: "FedAvg" client_id_list: "[]" client_num_in_total: 715 client_num_per_round: 10 comm_round: 1200 epochs: 1 batch_size: 4 client_optimizer: sgd learning_rate: 0.8 weight_decay: 0.001 validation_args: frequency_of_the_test: 50 device_args: worker_num: 10 using_gpu: true gpu_mapping_file: config/fedshakespeare_rnn/gpu_mapping.yaml gpu_mapping_key: mapping_default # [3, 3, 3, 2] comm_args: backend: "MPI" is_mobile: 0 ``` ================================================ FILE: experiments/nlp_rnn_fedshakespeare/config.yaml ================================================ # Basic configuration file for running classif_cnn example using torchvision CIFAR10 dataset. 
# Parameters needed to initialize the model model_config: model_type: RNN # class w/ `loss` and `inference` methods model_folder: experiments/nlp_rnn_fedshakespeare/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA, FedAvg or FedProx) strategy: FedAvg # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 50000 # how many iterations between metric eval on val set rec_freq: 50 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 1200 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 4 val_data: null # Assigned to null because dataset is being instantiated test: batch_size: 4 test_data: null # Assigned to null because dataset is being instantiated type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.8 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 4 list_of_train_data: null # Assigned to null because dataset is being instantiated desired_max_samples: 5000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.8 # this is overridden by `initial_lr_client` type: optimization ================================================ FILE: experiments/nlp_rnn_fedshakespeare/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.nlp_rnn_fedshakespeare.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x, y = np.array(x), np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/nlp_rnn_fedshakespeare/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import numpy as np from core.dataset import BaseDataset from experiments.nlp_rnn_fedshakespeare.dataloaders.preprocessing import FEDSHAKESPEARE class Dataset(BaseDataset): def __init__(self, data, test_only=False, user_idx=0, **kwargs): self.test_only = test_only self.user_idx = user_idx # Get all data self.user_list, self.user_data, self.user_data_label, self.num_samples = self.load_data(data, self.test_only) if user_idx == -1: self.user = self.user_list self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.vstack([user_label for user_label in self.user_data_label.values()]) else: if self.test_only: # combine all data into single array self.user = 'test_only' self.features = np.vstack([user_data for user_data in self.user_data.values()]) self.labels = np.vstack([user_label for user_label in self.user_data_label.values()]) else: # get a single user's data if user_idx is None: raise ValueError('in train mode, user_idx must be specified') self.user = self.user_list[user_idx] self.features = self.user_data[self.user] self.labels = self.user_data_label[self.user] def __getitem__(self, idx): return np.array(self.features[idx]).astype(np.int32).T, self.labels[idx] def __len__(self): return len(self.features) def load_data(self, data, test_only): '''Wrapper method to read/instantiate the dataset''' if data == None: dataset = FEDSHAKESPEARE() data = dataset.testset if test_only else dataset.trainset users = data['users'] features = data['user_data'] labels = data['user_data_label'] num_samples = data['num_samples'] return users, features, labels, num_samples ================================================ FILE: experiments/nlp_rnn_fedshakespeare/dataloaders/preprocessing.py ================================================ import logging import os import wget import tarfile import h5py import collections import numpy as np data_cache_dir = "./data" DEFAULT_TRAIN_FILE = "shakespeare_train.h5" DEFAULT_TEST_FILE = 
"shakespeare_test.h5" word_dict = None word_list = None _pad = "" _bos = "" _eos = "" ''' The FedeShakespeare dataset is taken from FedML repository. For more information regarding this dataset, please refer to https://github.com/FedML-AI/FedML/tree/master/python/fedml/data/fed_shakespeare. In order to download the data run the following commands: - wget --no-check-certificate --no-proxy https://fedml.s3-us-west-1.amazonaws.com/shakespeare.tar.bz2 - tar -xvf shakespeare.tar.bz2 This code follows the steps of preprocessing in tff shakespeare dataset: https://github.com/google-research/federated/blob/master/utils/datasets/shakespeare_dataset.py ''' SEQUENCE_LENGTH = 80 # from McMahan et al AISTATS 2017 # Vocabulary re-used from the Federated Learning for Text Generation tutorial. # https://www.tensorflow.org/federated/tutorials/federated_learning_for_text_generation CHAR_VOCAB = list("dhlptx@DHLPTX $(,048cgkoswCGKOSW[_#'/37;?bfjnrvzBFJNRVZ\"&*.26:\naeimquyAEIMQUY]!%)-159\r") def preprocess(sentences, max_seq_len=SEQUENCE_LENGTH): sequences = [] def to_ids(sentence, num_oov_buckets=1): """ map list of sentence to list of [idx..] and pad to max_seq_len + 1 Args: num_oov_buckets : The number of out of vocabulary buckets. max_seq_len: Integer determining shape of padded batches. 
""" tokens = [char_to_id(c) for c in sentence] tokens = [char_to_id(_bos)] + tokens + [char_to_id(_eos)] if len(tokens) % (max_seq_len + 1) != 0: pad_length = (-len(tokens)) % (max_seq_len + 1) tokens += [char_to_id(_pad)] * pad_length return ( tokens[i : i + max_seq_len + 1] for i in range(0, len(tokens), max_seq_len + 1) ) for sen in sentences: sequences.extend(to_ids(sen)) return sequences def char_to_id(char): word_dict = get_word_dict() if char in word_dict: return word_dict[char] else: return len(word_dict) def get_word_dict(): global word_dict if word_dict == None: words = [_pad] + CHAR_VOCAB + [_bos] + [_eos] word_dict = collections.OrderedDict() for i, w in enumerate(words): word_dict[w] = i return word_dict def split(dataset): ds = np.asarray(dataset) x = ds[:, :-1] y = ds[:, 1:] return x, y def download_files(data_cache_dir): URL = "https://fedml.s3-us-west-1.amazonaws.com/shakespeare.tar.bz2" if not os.path.exists(data_cache_dir): os.makedirs(data_cache_dir) file_path = os.path.join(data_cache_dir,"shakespeare.tar.bz2") # Download and decompress the file (if we haven't already) if not os.path.exists(file_path): wget.download(URL, out=file_path) file = tarfile.open(file_path) file.extractall(os.path.join(data_cache_dir,'fed_shakespeare')) file.close() class FEDSHAKESPEARE: def __init__(self) : download_files(data_cache_dir) train_h5 = h5py.File(os.path.join(data_cache_dir,'fed_shakespeare', DEFAULT_TRAIN_FILE), "r") test_h5 = h5py.File(os.path.join(data_cache_dir, 'fed_shakespeare',DEFAULT_TEST_FILE), "r") test_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} train_dict = {'users': [], 'num_samples': [], 'user_data': dict(), 'user_data_label': dict()} for user in train_h5['examples'].keys(): train_dict['users'].append(user) raw_train = train_h5['examples'][user]['snippets'][()] raw_train = [x.decode("utf8") for x in raw_train] user_data = preprocess(raw_train) train_dict['num_samples'].append(len(user_data)) # split 
data train_x, train_y = split(user_data) train_dict['user_data'][user] = train_x train_dict['user_data_label'][user] = train_y for user in test_h5['examples'].keys(): test_dict['users'].append(user) raw_test = test_h5['examples'][user]['snippets'][()] raw_test = [x.decode("utf8") for x in raw_test] user_data = preprocess(raw_test) test_dict['num_samples'].append(len(user_data)) # split data test_x, test_y = split(user_data) test_dict['user_data'][user] = test_x test_dict['user_data_label'][user] = test_y print(" Dictionaries ready .. ") self.trainset, self.testset = train_dict, test_dict ================================================ FILE: experiments/nlp_rnn_fedshakespeare/model.py ================================================ import torch from torch import nn from torch.nn import functional as F from core.model import BaseModel ''' The CNN_DropOut model is taken from FedML repository. For more information regarding this model, please refer to https://github.com/FedML-AI/FedML/blob/master/python/fedml/model/nlp/rnn.py. ''' class nlp_rnn_fedshakespeare(nn.Module): def __init__(self, embedding_dim=8, vocab_size=90, hidden_size=256): super(nlp_rnn_fedshakespeare, self).__init__() self.embeddings = nn.Embedding( num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=0 ) self.lstm = nn.LSTM( input_size=embedding_dim, hidden_size=hidden_size, num_layers=2, batch_first=True, ) self.fc = nn.Linear(hidden_size, vocab_size) def forward(self, input_seq): embeds = self.embeddings(input_seq) # Note that the order of mini-batch is random so there is no hidden relationship among batches. # So we do not input the previous batch's hidden state, # leaving the first hidden state zero `self.lstm(embeds, None)`. 
lstm_out, _ = self.lstm(embeds) # use the final hidden state as the next character prediction final_hidden_state = lstm_out[:, -1] # output = self.fc(final_hidden_state) # For fed_shakespeare output = self.fc(lstm_out[:, :]) output = torch.transpose(output, 1, 2) return output class RNN(BaseModel): '''This is a PyTorch model with some extra methods''' def __init__(self, model_config): super().__init__() self.net = nlp_rnn_fedshakespeare() def loss(self, input: torch.Tensor) -> torch.Tensor: '''Performs forward step and computes the loss''' device = 'cuda' if torch.cuda.is_available() else 'cpu' x, target = input['x'].to(device), input['y'].to(device) output = self.net.forward(x) criterion = nn.CrossEntropyLoss(ignore_index=0).to(device) return criterion(output, target.long()) def inference(self, input): '''Performs forward step and computes metrics''' device = 'cuda' if torch.cuda.is_available() else 'cpu' x, target = input['x'].to(device), input['y'].to(device) output = self.net.forward(x) n_samples = x.shape[0] pred = torch.argmax(output, dim=1) mask = (target != 0) accuracy = torch.sum((pred[mask] == target[mask]).float()).item() accuracy = accuracy/mask.sum() return {'output':output, 'acc': accuracy, 'batch_size': n_samples} ================================================ FILE: experiments/semisupervision/README.md ================================================ ### Data In order to run this experiment, you need to previously run the script [cifar_dataset.py](dataloaders/cifar_dataset.py) in order to download and preprocess the CIFAR100 dataset needed for this task. 
```code python experiments/semisupervision/dataloaders/cifar_dataset.py ``` ### Run Once the data has been downloaded, you can run the experiment as follows: ```code python -m torch.distributed.run --nproc_per_node=2 e2e_trainer.py -dataPath ~/data -outputPath ~/outputTest -config ./experiments/semisupervision/config.yaml -task semisupervision -backend nccl ``` ================================================ FILE: experiments/semisupervision/config.yaml ================================================ # Basic configuration file for running semisupervision with data loaded on-the-fly # Parameters needed to initialize the model model_config: model_type: Res # class w/ `loss` and `inference` methods model_folder: experiments/semisupervision/model.py # file containing class num_classes: 100 # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA, FedAvg or FedProx) strategy: FedLabels # Determines all the server-side settings for training and evaluation rounds server_config: send_dicts: true # if true, the server will update model dictionaries instead of grads wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: true # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1 # how many iterations between metric eval on val set rec_freq: 5000 # how many iterations between metric eval on test set initial_val: true initial_rec: false max_iteration: 2000 # how many iterations in total num_clients_per_iteration: 10 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 64 val_data: null test: batch_size: 64 test_data: null type: model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 0.003 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 64 list_of_train_data: null desired_max_samples: 87000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.003 # this is overridden by `initial_lr_client` momentum: 0 type: optimization semisupervision: uda: 1 num_classes: 100 isclust: 0 alpha: 0.1 train_ratio: 0.2 test_ratio: 0.0 val_ratio: 0.8 vat_ptb: 0 vat_consis: 0.05 lamb_consist: 0.05 unsup_lamb: 1 l2_lambda: 10 burnout_round: 50 thre: 0.3 comp: var eta: 0.003 bs: 64 unl_bs: 128 train_ep: 30 unsuptrain_ep: 10 ensize: 100 seed: 0 temp: 1 device: cuda size: 10 shuffle: 1 ================================================ FILE: experiments/semisupervision/dataloaders/RandAugment.py ================================================ ''' Code in this file is adapted from rpmcruz/autoaugment https://github.com/rpmcruz/autoaugment/blob/master/transformations.py This code is modified version of https://github.com/ildoonet/pytorch-randaugment/blob/master/RandAugment/augmentations.py for randaugmentation. 
''' import random import PIL, PIL.ImageOps, PIL.ImageEnhance, PIL.ImageDraw import numpy as np import torch from PIL import Image def ShearX(img, v): # [-0.3, 0.3] assert -0.3 <= v <= 0.3 if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, v, 0, 0, 1, 0)) def ShearY(img, v): # [-0.3, 0.3] assert -0.3 <= v <= 0.3 if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, v, 1, 0)) def TranslateX(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert -0.45 <= v <= 0.45 if random.random() > 0.5: v = -v v = v * img.size[0] return img.transform(img.size, PIL.Image.AFFINE, (1, 0, v, 0, 1, 0)) def TranslateXabs(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert 0 <= v if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, 0, v, 0, 1, 0)) def TranslateY(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert -0.45 <= v <= 0.45 if random.random() > 0.5: v = -v v = v * img.size[1] return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, v)) def TranslateYabs(img, v): # [-150, 150] => percentage: [-0.45, 0.45] assert 0 <= v if random.random() > 0.5: v = -v return img.transform(img.size, PIL.Image.AFFINE, (1, 0, 0, 0, 1, v)) def Rotate(img, v): # [-30, 30] assert -30 <= v <= 30 if random.random() > 0.5: v = -v return img.rotate(v) def AutoContrast(img, _): return PIL.ImageOps.autocontrast(img) def Invert(img, _): return PIL.ImageOps.invert(img) def Equalize(img, _): return PIL.ImageOps.equalize(img) def Flip(img, _): # not from the paper return PIL.ImageOps.mirror(img) def Solarize(img, v): # [0, 256] assert 0 <= v <= 256 return PIL.ImageOps.solarize(img, v) def SolarizeAdd(img, addition=0, threshold=128): img_np = np.array(img).astype(np.int) img_np = img_np + addition img_np = np.clip(img_np, 0, 255) img_np = img_np.astype(np.uint8) img = Image.fromarray(img_np) return PIL.ImageOps.solarize(img, threshold) def Posterize(img, v): # [4, 8] v = int(v) v = 
max(1, v) return PIL.ImageOps.posterize(img, v) def Contrast(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Contrast(img).enhance(v) def Color(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Color(img).enhance(v) def Brightness(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Brightness(img).enhance(v) def Sharpness(img, v): # [0.1,1.9] assert 0.1 <= v <= 1.9 return PIL.ImageEnhance.Sharpness(img).enhance(v) def Cutout(img, v): # [0, 60] => percentage: [0, 0.2] assert 0.0 <= v <= 0.2 if v <= 0.: return img v = v * img.size[0] return CutoutAbs(img, v) def CutoutAbs(img, v): # [0, 60] => percentage: [0, 0.2] # assert 0 <= v <= 20 if v < 0: return img w, h = img.size x0 = np.random.uniform(w) y0 = np.random.uniform(h) x0 = int(max(0, x0 - v / 2.)) y0 = int(max(0, y0 - v / 2.)) x1 = min(w, x0 + v) y1 = min(h, y0 + v) xy = (x0, y0, x1, y1) color = (125, 123, 114) # color = (0, 0, 0) img = img.copy() #print(img) PIL.ImageDraw.Draw(img).rectangle(xy, color) return img def SamplePairing(imgs): # [0, 0.4] def f(img1, v): i = np.random.choice(len(imgs)) img2 = PIL.Image.fromarray(imgs[i]) return PIL.Image.blend(img1, img2, v) return f def Identity(img, v): return img def augment_list(grey): # 16 oeprations and their ranges # https://github.com/google-research/uda/blob/master/image/randaugment/policies.py#L57 # l = [ # (Identity, 0., 1.0), # (ShearX, 0., 0.3), # 0 # (ShearY, 0., 0.3), # 1 # (TranslateX, 0., 0.33), # 2 # (TranslateY, 0., 0.33), # 3 # (Rotate, 0, 30), # 4 # (AutoContrast, 0, 1), # 5 # (Invert, 0, 1), # 6 # (Equalize, 0, 1), # 7 # (Solarize, 0, 110), # 8 # (Posterize, 4, 8), # 9 # # (Contrast, 0.1, 1.9), # 10 # (Color, 0.1, 1.9), # 11 # (Brightness, 0.1, 1.9), # 12 # (Sharpness, 0.1, 1.9), # 13 # # (Cutout, 0, 0.2), # 14 # # (SamplePairing(imgs), 0, 0.4), # 15 # ] if grey: # https://github.com/tensorflow/tpu/blob/8462d083dd89489a79e3200bcc8d4063bf362186/models/official/efficientnet/autoaugment.py#L505 l 
= [ (AutoContrast, 0, 1), (Equalize, 0, 1), (Invert, 0, 1), (Rotate, 0, 30), (Posterize, 0, 4), (Solarize, 0, 256), (SolarizeAdd, 0, 110), (Color, 0.1, 1.9), (Contrast, 0.1, 1.9), (Brightness, 0.1, 1.9), (Sharpness, 0.1, 1.9), (ShearX, 0., 0.3), (ShearY, 0., 0.3), (TranslateXabs, 0., 100), (TranslateYabs, 0., 100), ] else: l = [ (AutoContrast, 0, 1), (Equalize, 0, 1), (Invert, 0, 1), (Rotate, 0, 30), (Posterize, 0, 4), (Solarize, 0, 256), (SolarizeAdd, 0, 110), (Color, 0.1, 1.9), (Contrast, 0.1, 1.9), (Brightness, 0.1, 1.9), (Sharpness, 0.1, 1.9), (ShearX, 0., 0.3), (ShearY, 0., 0.3), (CutoutAbs, 0, 40), (TranslateXabs, 0., 100), (TranslateYabs, 0., 100), ] return l class Lighting(object): """Lighting noise(AlexNet - style PCA - based noise)""" def __init__(self, alphastd, eigval, eigvec): self.alphastd = alphastd self.eigval = torch.Tensor(eigval) self.eigvec = torch.Tensor(eigvec) def __call__(self, img): if self.alphastd == 0: return img alpha = img.new().resize_(3).normal_(0, self.alphastd) rgb = self.eigvec.type_as(img).clone() \ .mul(alpha.view(1, 3).expand(3, 3)) \ .mul(self.eigval.view(1, 3).expand(3, 3)) \ .sum(1).squeeze() return img.add(rgb.view(3, 1, 1).expand_as(img)) class CutoutDefault(object): """ Reference : https://github.com/quark0/darts/blob/master/cnn/utils.py """ def __init__(self, length): self.length = length def __call__(self, img): h, w = img.size(1), img.size(2) mask = np.ones((h, w), np.float32) y = np.random.randint(h) x = np.random.randint(w) y1 = np.clip(y - self.length // 2, 0, h) y2 = np.clip(y + self.length // 2, 0, h) x1 = np.clip(x - self.length // 2, 0, w) x2 = np.clip(x + self.length // 2, 0, w) mask[y1: y2, x1: x2] = 0. 
mask = torch.from_numpy(mask) mask = mask.expand_as(img) img *= mask return img class RandAugment: def __init__(self, n, m, grey=False): self.n = n self.m = m # [0, 30] self.augment_list = augment_list(grey) def __call__(self, img): ops = random.choices(self.augment_list, k=self.n) #print(ops) for op, minval, maxval in ops: val = (float(self.m) / 30) * float(maxval - minval) + minval img = op(img, val) return img ================================================ FILE: experiments/semisupervision/dataloaders/cifar_dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import time import json import torch import numpy as np import pathlib from torchvision import datasets, transforms from torch.utils.data import TensorDataset, DataLoader from numpy.random import RandomState TRAINSET = "trainset.json" TRAINSET_UNLAB = "trainset_unlab.json" TRAINSET_UNLAB_RAND = "trainset_unlab_rand.json" TESTSET = "testset.json" ROOT = './data' class CIFAR100: def __init__(self, user_idx=None, test_only=None, args=None, read_data=True) : if read_data: # Reads the data previously saved on files if user_idx == -1: if test_only: print("Reading testing file") file = os.path.join(ROOT,TESTSET) else: print("Reading training labeled file") file = os.path.join(ROOT,TRAINSET) elif user_idx == -2: print("Reading unlabeled training file") file = os.path.join(ROOT, TRAINSET_UNLAB) elif user_idx == -3: print("Reading unlabeled random training file") file = os.path.join(ROOT, TRAINSET_UNLAB_RAND) with open(file, 'r') as f: json_file = json.load(f) self.data = json_file else: # Create, preprocess and save the datasets from RandAugment import RandAugment trans = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), 
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) transform_unlabeltrain = transforms.Compose([ RandAugment(1, 10), transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))]) # Download and preprocess datasets trainset = datasets.CIFAR100('./data', train=True, download=True, transform=transform_train) unlabel_trainset = datasets.CIFAR100('./data', train=True, download=True, transform=transform_unlabeltrain) self.pretestset = datasets.CIFAR100('./data', train=False, download=True, transform=trans) train_loader = DataLoader(trainset, batch_size=len(trainset)) ultrain_loader = DataLoader(unlabel_trainset, batch_size=len(unlabel_trainset)) X_train = next(iter(train_loader))[0].numpy() Y_train = next(iter(train_loader))[1].numpy() X_unlabel_train = next(iter(ultrain_loader))[0].numpy() Y_unlabel_train = next(iter(ultrain_loader))[1].numpy() self.pretrainset, trainset_unlab_rand, trainset_unlab, \ self.embed_dim = partition_imagedataset(X_train, Y_train, X_unlabel_train, Y_unlabel_train,args) self.trainset = _process(self.pretrainset, train=True) self.trainset_unlab = _process(trainset_unlab, train=True) self.trainset_unlab_rand = _process(trainset_unlab_rand, train=True) self.testset = _process(self.pretestset, train=False) save_json(self.trainset, TRAINSET) save_json(self.trainset_unlab, TRAINSET_UNLAB) save_json(self.trainset_unlab_rand, TRAINSET_UNLAB_RAND) save_json(self.testset, TESTSET) def save_json(dict, filename): f = open(os.path.join('./data',filename), "w") json.dump(dict,f) f.close() def _process(dataset, train=True): '''Process a Torchvision/preprocessed dataset to expected FLUTE format''' print('Converting data to expected format...') start_time = time.time() data_dict = {'users':[], 'num_samples': [], 'user_data':{}, 'user_data_label':{}} for i in range(len(dataset)): if train: x, y = dataset[i]['x'], dataset[i]['y'] 
else: x, y = dataset[i] data_dict['users'].append(f'{i:04d}') data_dict['num_samples'].append(len(y) if train else 1) data_dict['user_data'][f'{i:04d}'] = [xi.tolist() for xi in x] if train else [x.tolist()] data_dict['user_data_label'][f'{i:04d}'] = [yi.tolist() for yi in y] if train else y print(f'Finished converting data in {time.time() - start_time:.2f}s.') return data_dict def partition_imagedataset(X_train, Y_train, X_unlabel_train, Y_unlabel_train, args): if args['isclust'] == 1: partition = __getClusteredData__(Y_train, args['ensize']) elif args['isclust'] == 2: partition = __getClusteredMixedData__(Y_train, args['ensize']) else: partition = __getDirichletData__(Y_train, args) dataset_train = [] dataset_val = [] dataset_val_norand = [] dataset_test = [] train_ratio = args['train_ratio'] val_ratio = args['val_ratio'] test_ratio = args['test_ratio'] x_for_embed = np.shape(X_train[0]) for (i, ind) in enumerate(partition): x = X_train[ind] y = Y_train[ind] x_ul = X_unlabel_train[ind] y_ul = Y_unlabel_train[ind] n_i = len(ind) train_size = int(train_ratio * n_i) val_size = int(val_ratio * n_i) test_size = int(test_ratio * n_i) x_train = torch.Tensor(x[val_size:val_size + train_size]) y_train = torch.LongTensor(y[val_size:val_size + train_size]) dataset_train_torch = {'x': x_train, 'y':y_train} if val_size == 0: x_val = x_train y_cal = y_train dataset_val_torch = dataset_train_torch dataset_val_torch_norand = dataset_train_torch else: x_val = torch.Tensor(x[:val_size]) y_val = torch.LongTensor(y[:val_size]) x_ul_val = torch.Tensor(x_ul[:val_size]) y_ul_val = torch.LongTensor(y_ul[:val_size]) dataset_val_torch = {'x': x_ul_val, 'y': y_ul_val} dataset_val_torch_norand = {'x':x_val, 'y':y_val} dataset_train.append(dataset_train_torch) dataset_val.append(dataset_val_torch) dataset_val_norand.append(dataset_val_torch_norand) return dataset_train, dataset_val, dataset_val_norand, x_for_embed def __getDirichletData__(y, args): n = args['ensize'] n_nets = args['ensize'] 
K = args['num_classes'] num_c = args['num_classes'] labelList_true = y min_size = 0 N = len(labelList_true) rnd = 0 rann = RandomState(rnd) net_dataidx_map = {} p_client = np.zeros((n, num_c)) for i in range(n): p_client[i] = rann.dirichlet(np.repeat(args['alpha'], num_c)) idx_batch = [[] for _ in range(n_nets)] for k in range(K): idx_k = np.where(labelList_true == k)[0] rann.shuffle(idx_k) proportions = p_client[:, k] proportions = proportions / proportions.sum() proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))] for j in range(n_nets): if args['shuffle'] == 1: rann.shuffle(idx_batch[j]) net_dataidx_map[j] = idx_batch[j] net_cls_counts_label = {} net_cls_counts_unlabel = {} for net_i in range(len(idx_batch)): n_i = len(idx_batch[net_i]) train_size = int(args['train_ratio'] * n_i) val_size = int(args['val_ratio'] * n_i) unq, unq_cnt = np.unique(labelList_true[idx_batch[net_i][val_size:val_size + train_size]], return_counts=True) tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} net_cls_counts_label[net_i] = tmp unq1, unq_cnt1 = np.unique(labelList_true[idx_batch[net_i][:val_size]], return_counts=True) tmp1 = {unq1[i]: unq_cnt1[i] for i in range(len(unq1))} net_cls_counts_unlabel[net_i] = tmp1 local_sizes = [] for i in range(n_nets): local_sizes.append(len(net_dataidx_map[i])) local_sizes = np.array(local_sizes) weights = local_sizes / np.sum(local_sizes) return idx_batch if __name__ == "__main__": # Download and preprocess data args= {'name': 'FedVATnew', 'isaml':0, 'uda':1 , 'dataset': 'cifar100', 'num_classes': 100, 'isclust': 0, 'alpha': 0.1, 'train_ratio': 0.2, 'val_ratio':0.8, 'shuffle':1, 'vat_ptb':0.0 , 'vat_consis':0.05, 'unsup_lamb':1, 'l2_lambda':10, 'bo': 50, 'thre': 0.3, 'comp': 'var', 'eta': 0.003, 'bs':64, 'unl_bs':128, 'train_ep':30, 'unsuptrain_ep':10, 'rounds':2000, 'ensize':100, 'size': 10, 'model': 'RES50', 'seed': 0, 
'test_ratio': 0.0} data = CIFAR100(read_data=False, args=args) ================================================ FILE: experiments/semisupervision/dataloaders/dataloader.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import torch import numpy as np from core.dataloader import BaseDataLoader from experiments.semisupervision.dataloaders.dataset import Dataset class DataLoader(BaseDataLoader): def __init__(self, mode, num_workers=0, **kwargs): args = kwargs['args'] self.batch_size = args['batch_size'] dataset = Dataset( data=kwargs['data'], test_only=(not mode=='train'), user_idx=kwargs.get('user_idx', None), ) super().__init__( dataset, batch_size=self.batch_size, shuffle=(mode=='train'), num_workers=num_workers, collate_fn=self.collate_fn, ) def collate_fn(self, batch): x, y = list(zip(*batch)) x = np.array(x) y = np.array(y) return {'x': torch.tensor(x), 'y': torch.tensor(y)} ================================================ FILE: experiments/semisupervision/dataloaders/dataset.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
class Dataset(BaseDataset):
    """Per-user view over the CIFAR100 semi-supervision data.

    When ``test_only`` is set, features/labels of every user are merged
    into a single evaluation split; otherwise only the data of the user
    at ``user_idx`` is exposed. ``user_idx == -1`` defers the assignment
    (no features/labels are materialized yet).
    """

    def __init__(self, data, test_only=False, user_idx=0, **kwargs):
        self.test_only = test_only
        self.user_idx = user_idx
        args = kwargs.get('args', None)

        # Read the full dataset (or reuse the dict that was passed in)
        self.user_list, self.user_data, self.user_data_label, self.num_samples = \
            self.load_data(data, self.test_only, args)

        if user_idx == -1:
            return  # data will be attached later by the framework

        if self.test_only:
            # Merge every user into one flat evaluation set
            self.user = 'test_only'
            self.features = np.vstack(list(self.user_data.values()))
            self.labels = np.hstack(list(self.user_data_label.values()))
        else:
            # Single user's shard
            if user_idx is None:
                raise ValueError('in train mode, user_idx must be specified')
            self.user = self.user_list[user_idx]
            self.features = self.user_data[self.user]
            self.labels = self.user_data_label[self.user]

    def __getitem__(self, idx):
        return np.array(self.features[idx]).astype(np.float32), self.labels[idx]

    def __len__(self):
        return len(self.features)

    def load_data(self, data, test_only, sup_config):
        '''Wrapper method to read/instantiate the dataset'''
        if data is None:
            data = CIFAR100(self.user_idx, test_only, sup_config).data
        return (
            data['users'],
            data['user_data'],
            data['user_data_label'],
            data['num_samples'],
        )
class BasicBlock(nn.Module):
    """Two 3x3 convolutions with an identity (or projected) skip connection."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Project the input whenever the spatial size or channel count changes
        out_planes = self.expansion * planes
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = self.bn2(self.conv2(h))
        h = h + self.shortcut(x)
        return F.relu(h)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck with a 4x channel expansion."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes,
                               kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        # Project the input whenever the spatial size or channel count changes
        out_planes = self.expansion * planes
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = self.bn3(self.conv3(h))
        h = h + self.shortcut(x)
        return F.relu(h)


class ResNet(nn.Module):
    """CIFAR-style ResNet (3x3 stem, four stages, 4x4 average pooling)."""

    def __init__(self, block, num_blocks, num_classes=10, inchannels=3):
        super(ResNet, self).__init__()
        self.in_planes = 64

        self.conv1 = nn.Conv2d(inchannels, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        # Only the first block of a stage downsamples
        blocks = []
        for s in [stride] + [1] * (num_blocks - 1):
            blocks.append(block(self.in_planes, planes, s))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*blocks)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            h = stage(h)
        h = F.avg_pool2d(h, 4)
        h = h.view(h.size(0), -1)
        return self.linear(h)


def ResNet18(num_classes=10):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes)

def ResNet18_emnist(num_classes=62, inchannel=1):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet18_organ(num_classes=11, inchannel=1):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet18_path(num_classes=9, inchannel=3):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet18_blood(num_classes=8, inchannel=3):
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes, inchannel)

def ResNet34(num_classes=10):
    return ResNet(BasicBlock, [3, 4, 6, 3], num_classes)

def ResNet50(num_classes=10):
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes)

def ResNet101(num_classes=10):
    return ResNet(Bottleneck, [3, 4, 23, 3], num_classes)

def ResNet152(num_classes=10):
    return ResNet(Bottleneck, [3, 8, 36, 3], num_classes)


def test():
    # Quick smoke test: one forward pass on a CIFAR-sized input
    net = ResNet18()
    y = net(torch.randn(1, 3, 32, 32))
    print(y.size())
class Res(BaseModel):
    '''ResNet50 classifier wrapped with the loss/inference hooks FLUTE expects'''

    def __init__(self, model_config):
        super().__init__()
        self.net = ResNet50(num_classes=model_config['num_classes'])

    def forward(self, x):
        return self.net.forward(x)

    def loss(self, input: torch.Tensor) -> torch.Tensor:
        '''Performs forward step and computes the loss'''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        logits = self.net.forward(features)
        if self.net.training:
            return torch.nn.CrossEntropyLoss()(logits, labels)
        # Evaluation: sum over the batch, then normalize by batch size
        return F.cross_entropy(logits, labels, reduction='sum') / labels.size(0)

    def inference(self, input):
        '''Performs forward step and computes metrics'''
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        features, labels = input['x'].to(device), input['y'].to(device)
        # A 0-d label tensor is promoted to a 1-element batch
        if len(np.shape(labels)) == 0:
            labels = torch.stack([labels])
        output = self.net.forward(features)
        log_probs = torch.nn.LogSoftmax(dim=1)(output)
        _, predicted = log_probs.max(1)
        n_samples = labels.size(0)
        n_correct_pct = predicted.eq(labels).sum().item() * 100
        return {'output': output,
                'acc': n_correct_pct / n_samples,
                'batch_size': n_samples}
""" super(SequenceWise, self).__init__() self.module = module def forward(self, x): t, n = x.size(0), x.size(1) x = x.view(t * n, -1) x = x.contiguous() x = self.module(x) x = x.view(t, n, -1) return x def __repr__(self): tmpstr = self.__class__.__name__ + ' (\n' tmpstr += self.module.__repr__() tmpstr += ')' return tmpstr class BatchRNN(nn.Module): def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True,dropout=0.0,multi=1): super(BatchRNN, self).__init__() self.input_size = input_size self.hidden_size = hidden_size self.batch_norm_activate = batch_norm self.bidirectional = bidirectional self.multi = multi self.dropout = dropout if self.batch_norm_activate: self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) self.rnn = rnn_type(input_size = input_size, hidden_size = hidden_size, bidirectional= bidirectional, bias = True, batch_first = True, dropout = self.dropout) self.num_directions = 2 if bidirectional else 1 def forward(self, x): if x.dim()==2: x=x.unsqueeze(1) if self.batch_norm_activate: x = x.contiguous() x = self.batch_norm(x) x, _ = self.rnn(x) if self.bidirectional and self.multi<2: x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1) return x class NeuralNetwork(nn.Module): def __init__(self, params, wantLSTM=False, batch_norm=False): super(NeuralNetwork, self).__init__() """ The following parameters need revisiting self.number_of_actions = 2 self.gamma = 0.99 self.final_epsilon = 0.0001 self.initial_epsilon = 0.1 self.number_of_iterations = 2000000 self.replay_memory_size = 10000 self.minibatch_size = 32 optimizer = optim.Adam(model.parameters(), lr=1e-6) criterion = nn.MSELoss() """ self.wantLSTM = wantLSTM self.batch_norm= batch_norm params = [int(x) for x in params.split(',')] layers = [] self.softmax = nn.Softmax(dim = 1) if self.wantLSTM: # Recurrent Component of the architecture rnns = [] for i in range(1, len(params) - 2): multi = 1 if i==1 else 1 rnn = BatchRNN(input_size 
class RL:
    """Deep Q-learning agent used by the server to weight/select clients.

    Maintains a replay memory of (state, action, reward) transitions, an
    epsilon-greedy policy over a NeuralNetwork Q-model, and checkpointing
    of model/optimizer/scheduler state plus training stats to disk.
    """

    def __init__(self, config=None):
        # Finalized config-file
        self.config = config
        self.out_size = config["num_clients_per_iteration"]
        self.wantLSTM = config['RL']['wantLSTM'] if 'wantLSTM' in config['RL'] else False
        self.replay_memory = []
        self.state_memory = []
        self.epsilon = config['RL']['initial_epsilon']
        self.step = 0
        self.runningLoss = 0

        model_descriptor = config['RL']['model_descriptor_RL'] if 'model_descriptor_RL' in config['RL'] else 'Default'
        self.model_name = os.path.join(config['RL']['RL_path'],
                                       'rl_{}.{}.model'.format(self.out_size, model_descriptor))
        self.stats_name = os.path.join(config['RL']['RL_path'],
                                       'rl_{}.{}.stats'.format(self.out_size, model_descriptor))

        # Initialize RL model
        self.make_model()
        self.load_saved_status()

        # Set the RL weights
        self.rl_weights = None
        self.rl_losses = None

        self.criterion = nn.MSELoss()

    def set_losses(self, losses):
        self.rl_losses = losses

    def set_weights(self, weights):
        self.rl_weights = weights

    def forward(self, state=None):
        """Return an action vector for ``state`` (epsilon-greedy exploration)."""
        if self.wantLSTM:
            # Roll the new state into a fixed-size window of past states
            N = len(state)
            state.resize(1, N)
            if len(self.state_memory) == 0:
                self.state_memory = np.zeros((self.config['RL']['minibatch_size'], N))
            self.state_memory = np.concatenate((self.state_memory[1:], state), axis=0)
            state = self.state_memory

        if random.random() <= self.epsilon:
            print_rank("Performed random action!")
            action = to_device(torch.rand(self.out_size))
        else:
            state = to_device(torch.from_numpy(state))
            print_rank(f'RL_state: {state.shape}')
            action = self.model(state.float())
        return action

    def train(self, batch=None):
        """One DQN update step from a (state, action, reward) transition."""
        # save transition to replay memory; drop the oldest when full
        self.replay_memory.append(batch)
        if len(self.replay_memory) > self.config['RL']['max_replay_memory_size']:
            self.replay_memory.pop(0)

        # epsilon annealing (stops once final_epsilon would be undershot)
        self.epsilon *= self.config['RL']['epsilon_gamma'] if self.epsilon * self.config['RL']['epsilon_gamma'] > self.config['RL']['final_epsilon'] else 1.0

        # sample minibatch: most recent window for LSTM, uniform otherwise
        if self.wantLSTM:
            if len(self.replay_memory) >= self.config['RL']['minibatch_size']:
                minibatch = self.replay_memory[-self.config['RL']['minibatch_size']:]
            else:
                minibatch = self.replay_memory
        else:
            minibatch = random.sample(self.replay_memory,
                                      min(len(self.replay_memory), self.config['RL']['minibatch_size']))

        # unpack minibatch
        state_batch = torch.tensor(tuple(d[0] for d in minibatch)).float()
        action_batch = torch.tensor(tuple(d[1] for d in minibatch)).float()
        reward_batch = torch.tensor(tuple(d[2] for d in minibatch)).float()

        state_batch = to_device(state_batch)
        action_batch = to_device(action_batch)
        reward_batch = to_device(reward_batch)

        # set y_j to r_j for terminal state, otherwise to r_j + gamma*max(Q)
        y_batch = reward_batch

        # extract Q-value
        print_rank(f'RL state_batch: {state_batch.shape}', loglevel=logging.DEBUG)
        state_output = self.model(state_batch)
        print_rank(f'RL train shapes: {state_batch.shape} {action_batch.shape} {state_output.shape}', loglevel=logging.DEBUG)
        q_value = torch.sum(state_output * action_batch, dim=1)

        # reset gradient
        self.optimizer.zero_grad()

        # target is detached from the graph: it never requires gradient
        y_batch = y_batch.detach()

        # calculate loss and backpropagate
        loss = self.criterion(q_value, y_batch)
        loss.backward()
        self.optimizer.step()

        # Tracking a running average of loss
        if self.runningLoss == 0:
            self.runningLoss = loss.item()
        else:
            self.runningLoss = 0.95 * self.runningLoss + 0.05 * loss.item()
        print_rank('Running Loss for RL training process: {}'.format(self.runningLoss))

        # Decay learning rate
        self.lr_scheduler.step()

    def make_model(self):
        """Instantiate the Q-network, its optimizer and LR scheduler."""
        self.model = NeuralNetwork(self.config['RL']['network_params'],
                                   self.config['RL']['wantLSTM'] if 'wantLSTM' in self.config['RL'] else False,
                                   self.config['RL']['batchNorm'] if 'batchNorm' in self.config['RL'] else False)
        print(self.model)
        # BUG FIX: the original did `model = to_device(model)`, which read an
        # undefined local name (NameError) and never moved the actual network.
        self.model = to_device(self.model)

        # make optimizer
        self.optimizer = make_optimizer(self.config['RL']["optimizer_config"], self.model)

        # make lr_scheduler
        self.lr_scheduler = make_lr_scheduler(
            self.config['RL']['annealing_config'],
            self.optimizer,
            num_batches=1)

    def load_saved_status(self):
        """Resume model weights and training stats from disk when present."""
        if os.path.exists(self.model_name):
            print_rank("Resuming from checkpoint model {}".format(self.model_name))
            self.load()

        if os.path.exists(self.stats_name):
            with open(self.stats_name, 'r') as logfp:
                # loading the iteration no., val_loss and lr_weight
                elems = json.load(logfp)
                self.cur_iter_no = elems["i"]
                self.val_loss = elems["val_loss"]
                self.val_cer = elems["val_cer"]
                self.runningLoss = elems["weight"]

    def load(self):
        """Restore model/optimizer/scheduler state dicts from the checkpoint."""
        print_rank("Loading checkpoint: {}".format(self.model_name))
        checkpoint = torch.load(self.model_name)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        if self.optimizer is not None:
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        anl_st_dict = checkpoint.get('lr_scheduler_state_dict')
        if anl_st_dict and self.lr_scheduler is not None:
            self.lr_scheduler.load_state_dict(anl_st_dict)

    def save(self, i):
        """
        Save a model as well as training information
        """
        save_state = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict() if self.optimizer is not None else None,
            'lr_scheduler_state_dict': self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None
        }

        outputdir = os.path.dirname(self.model_name)
        if os.path.exists(outputdir) is False:
            os.makedirs(outputdir, exist_ok=True)

        print_rank("Saving model to: {}".format(self.model_name))
        try_except_save(torch_save, state_or_model=save_state, save_path=self.model_name)

        # logging the latest best values
        print_rank(f'Saving stats to {self.stats_name}')
        with open(self.stats_name, 'w') as logfp:
            json.dump({"i": i + 1,
                       "val_loss": float(self.rl_losses[0]),
                       "val_cer": float(self.rl_losses[1]),
                       "weight": float(self.runningLoss)}, logfp)
def compute_LDP_noise_std(eps, max_sensitivity, delta):
    """Gaussian-mechanism noise std for (eps, delta)-DP at the given L2 sensitivity."""
    return np.sqrt(2 * np.log(1.25 / delta)) * max_sensitivity / eps


def _beta2betainc_ratio(a, x):
    # Reciprocal of the regularized incomplete beta function I_x(a, a)
    return 1 / betainc(a, a, x)


def _log_m1(d, alpha, gamma):
    return alpha * np.log(1 - gamma**2) - (d - 2) * np.log(2) - np.log(d - 1)


def _log_m2(p, tau, alpha):
    ratio = _beta2betainc_ratio(alpha, tau)
    return np.log(p / (ratio - 1) - (1 - p)) + np.log(ratio) - betaln(alpha, alpha)


def _efficient_m(d, gamma, p):
    alpha = (d - 1) / 2
    tau = (1 + gamma) / 2
    return np.exp(_log_m1(d, alpha, gamma) + _log_m2(p, tau, alpha))


def privacy_parameters(eps0, eps, d):
    """Map the split budgets (eps0, eps) to PrivUnit2's sampling prob and cap gamma."""
    exp_eps0 = np.exp(eps0)
    p0 = 1 if exp_eps0 == np.inf else exp_eps0 / (1 + exp_eps0)

    exp_eps = np.exp(eps)
    base = np.sqrt(np.pi / (2 * (d - 1)))
    gamma = base if exp_eps == np.inf else ((exp_eps - 1) / (exp_eps + 1)) * base
    return p0, gamma


def private_unit2(grad, gamma, prob):
    """PrivUnit2: privatize a unit-norm vector via cap-based rejection sampling."""
    np.testing.assert_almost_equal(grad.norm().cpu().item(), 1, decimal=5)
    assert prob >= 0.5
    assert (0 <= gamma <= 1)
    p = T.rand(())
    while True:
        # create a uniform distribution over the d-sphere
        V = T.normal(0, 1, grad.shape, device=grad.device)
        V = V / V.norm()
        dot_prod = T.dot(V, grad)
        # Accept inside the cap w.p. prob, outside w.p. 1-prob
        if (dot_prod >= gamma and p < prob) or (dot_prod < gamma and p >= prob):
            break
    m = _efficient_m(grad.shape[0], gamma, prob)
    return V / m


def add_gaussian_noise(grad, eps, max_grad, delta):
    """Add (eps, delta)-DP Gaussian noise; returns the noisy tensor and sigma used."""
    sigma = compute_LDP_noise_std(eps, max_grad, delta)
    noisy_grad = sigma * T.randn(grad.shape, device=grad.device) + grad
    return noisy_grad, sigma


def add_private_unit2_noise(eps, grad):
    """Split eps (1% / 99%) and apply PrivUnit2 to a unit-norm gradient."""
    eps0 = 0.01 * eps
    eps1 = 0.99 * eps
    samp_prob, gamma = privacy_parameters(eps0, eps1, grad.shape[0])
    return private_unit2(grad, gamma, samp_prob)
def scalar_DP(r, eps, k, r_max):
    """Randomize a scalar in [0, r_max] with eps-LDP via k-level discretization."""
    r = np.minimum(r, r_max)
    val = k * r / r_max
    f_val = math.floor(val)
    c_val = math.ceil(val)
    # Unbiased rounding to one of the two neighbouring levels
    J = f_val if T.rand(()) < (c_val - val) else c_val

    exp_eps = np.exp(eps)
    rand_prob = exp_eps / (exp_eps + k)
    if T.rand(()) >= rand_prob:
        # Randomized response: replace J with a uniformly drawn different level
        while True:
            J_ = T.randint(0, k + 1, ()).item()
            if J != J_:
                J = J_
                break

    # Debias the randomized level back to the original scale
    a = ((exp_eps + k) / (exp_eps - 1)) * (r_max / k)
    b = (k * (k + 1)) / (2 * (exp_eps + k))
    return a * (J - b)


def laplace_noise(max_sens, eps, vocab_size):
    """Draw Laplace noise with scale max_sens/eps for each vocabulary entry."""
    return np.random.laplace(0.0, max_sens / eps, vocab_size)


def unroll_network(named_params, select_grad=False):
    """Flatten parameters (or their grads) into a single 1-D tensor.

    Returns the concatenated vector plus a dict mapping each parameter
    name to its (start, end) slice so the layout can be restored later.
    """
    params_ids, flat_params = {}, []
    cursor = 0
    for name, param in named_params:
        tensor = param.grad if select_grad else param.data
        flat = tensor.view(-1)
        flat_params.append(flat)
        params_ids[name] = (cursor, cursor + flat.shape[0])
        cursor += flat.shape[0]
    return T.cat(flat_params), params_ids


def update_network(named_params, params_ids, flat_params, apply_to_grad=False):
    """Write a flat vector produced by unroll_network back into the layers."""
    for name, param in named_params:
        start, end = params_ids[name]
        chunk = flat_params[start:end]
        if apply_to_grad:
            param.grad.copy_(chunk.view(*param.grad.shape))
        else:
            param.data.copy_(chunk.view(*param.data.shape))
def apply_local_dp(trainer, weight, dp_config, add_weight_noise):
    '''Apply client-side DP, possibly given a data-dependent aggregation weight

    Clips the client gradient to ``max_grad``; when ``eps`` >= 0 it also
    rescales the gradient to that norm and adds Gaussian LDP noise, optionally
    noising the aggregation weight jointly with the gradient.

    Args:
        trainer (core.Trainer object): trainer on client.
        weight (float): this client's aggregation weight.
        dp_config (dict): DP config on original config file.
        add_weight_noise (bool): whether noise should be added to aggregation weight.

    Returns:
        The aggregation weight (noised only when ``add_weight_noise`` is set).
    '''
    # Unroll the network grads as 1D vectors
    flat_grad, params_ids = unroll_network(trainer.model.named_parameters(), select_grad=True)
    grad_norm = flat_grad.norm().cpu().item()

    if dp_config['eps'] < 0:
        # Negative eps disables noise: clip only.
        # NOTE(review): grads are written back only when clipping occurred —
        # assumes unclipped grads need no update; confirm against callers.
        if grad_norm > dp_config['max_grad']:
            flat_grad = flat_grad * (dp_config['max_grad'] / grad_norm)
            update_network(trainer.model.named_parameters(), params_ids, flat_grad, apply_to_grad=True)
    else:
        # Get Gaussian LDP noise
        dp_eps = dp_config['eps']
        delta = dp_config.get('delta', 1e-7)  # TODO pre-compute in config
        weight_ = weight  # keep the original weight for the no-noise case

        # Scaling the weight down so we don't impact the noise too much
        weight = dp_config.get('weight_scaler', 1) * weight
        weight = min(dp_config['max_weight'], weight)

        # Normalize the gradient to exactly max_grad, then append the weight
        # as one extra coordinate so gradient and weight are noised jointly
        flat_noisy_grad = dp_config['max_grad'] * (flat_grad / flat_grad.norm())
        max_sensitivity = np.sqrt(dp_config['max_grad']**2 + (dp_config['max_weight']**2 if add_weight_noise else 0.0))
        flat_noisy_grad = T.cat([flat_noisy_grad, T.tensor([weight], device=flat_noisy_grad.device)], dim=0)
        flat_noisy_grad, _ = add_gaussian_noise(flat_noisy_grad, dp_eps, max_sensitivity, delta)

        # Clamp the noised weight back into [min_weight, max_weight]
        weight = min(max(flat_noisy_grad[-1].item(), dp_config['min_weight']), dp_config['max_weight'])

        # Scaling the weight back up after noise addition (This is a DP-protect transformation)
        weight = weight / dp_config.get('weight_scaler', 1)

        if not add_weight_noise:
            weight = weight_
        # Drop the appended weight coordinate before restoring the gradient
        flat_noisy_grad = flat_noisy_grad[:-1]

        print_rank('Cosine error from noise {}'.format(T.nn.functional.cosine_similarity(flat_grad, flat_noisy_grad, dim=0)), loglevel=logging.DEBUG)
        print_rank('Error from noise is {}'.format((flat_grad-flat_noisy_grad).norm()), loglevel=logging.DEBUG)
        print_rank('weight is {} and noisy weight is {}'.format(weight_, weight), loglevel=logging.DEBUG)

        # Return back to the network
        update_network(trainer.model.named_parameters(), params_ids, flat_noisy_grad, apply_to_grad=True)

    return weight
def update_privacy_accountant(config, num_clients, curr_iter, num_clients_curr_iter):
    '''Update the RDP privacy accountant and log the accounting parameters.

    All needed parameters are dumped to the log so as not to slow down training.

    Args:
        config (dict): full experiment config; may contain a `dp_config` entry.
        num_clients (int): total number of users.
        curr_iter (int): current (0-based) training iteration.
        num_clients_curr_iter (int): number of users sampled this iteration.

    Returns:
        The RDP epsilon spent so far, or None when DP is disabled.
    '''
    dp_config = config.get('dp_config', None)

    # BUG FIX: the original condition was
    #   `dp_config is not None and dp_config.get(...) or dp_config.get(...)`
    # which, by and/or precedence, dereferenced dp_config even when it is None
    # (AttributeError). Parenthesize the flag check instead.
    if dp_config is not None and (dp_config.get('enable_global_dp', False)
                                  or dp_config.get('enable_local_dp', False)):
        from math import sqrt, exp, log
        import extensions.privacy.analysis as privacy_analysis

        K = 1  # from DP perspective each user is contributing one gradient
        B = num_clients_curr_iter  # batch size
        n = num_clients
        T = curr_iter + 1
        _delta = dp_config.get('delta', min(1e-7, 1. / (n * log(n))))  # TODO should be precomputed in config

        # Derive the noise multiplier: either from the local noise scale,
        # or directly from the configured global sigma.
        if dp_config.get('global_sigma', None) is None:
            max_sensitivity = np.sqrt(dp_config['max_grad'] ** 2 + dp_config['max_weight'] ** 2)
            noise_scale = compute_LDP_noise_std(dp_config['eps'], max_sensitivity, _delta)
            global_sigma = noise_scale * np.sqrt(B) / max_sensitivity
        else:
            global_sigma = dp_config['global_sigma']
            noise_scale = global_sigma * dp_config['max_grad'] / B

        try:
            mu = K * B / n * sqrt(T * exp((1. / global_sigma) ** 2 - 1))
        except OverflowError:
            print_rank(f"Error computing mu {global_sigma} {K} {B} {n} {T}")
            mu = -1

        orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] +
                  list(range(5, 64)) + [128, 256, 512])
        q = B / n
        _sigma = global_sigma  # was: noise_scale but we should apply the noise multiplier.
        rdp = privacy_analysis.compute_rdp(q, _sigma, T, orders)
        rdp_epsilon, opt_order = privacy_analysis.get_privacy_spent(orders, rdp, _delta)

        props = {
            'dp_global_K': K,  # gradients per user
            'dp_global_B': B,  # users per batch
            'dp_global_n': n,  # total users
            'dp_global_T': T,  # how many iterations
            'dp_sigma': _sigma,  # noise_multiplier. Should be combined global+local sigma.
            'dp_global_mu': mu,
            'dp_epsilon_rdp': rdp_epsilon,
            'dp_opt_order': opt_order,
            'dp_delta': _delta,
            'dp_noise_scale': noise_scale  # Note: not needed for accounting.
        }

        print_rank(f'DP accounting: {json.dumps(props)}')
        for k in props:
            run.log(k, props[k])

        return rdp_epsilon
    else:
        return None
} print_rank(f'DP accounting: {json.dumps(props)}') for k in props: run.log(k, props[k]) return rdp_epsilon else: return None ================================================ FILE: extensions/privacy/analysis.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved """ *Borrowed from Facebook Opacus, which in turn borrowed from Tensorflow Privacy. *Facebook's original notice follows below. *Based on Google's TF Privacy:* https://github.com/tensorflow/privacy/blob/master/tensorflow_privacy/privacy/analysis/rdp_accountant.py. *Here, we update this code to Python 3, and optimize dependencies.* Functionality for computing Renyi Differential Privacy (RDP) of an additive Sampled Gaussian Mechanism (SGM). Example: Suppose that we have run an SGM applied to a function with L2-sensitivity of 1. Its parameters are given as a list of tuples ``[(q_1, sigma_1, steps_1), ..., (q_k, sigma_k, steps_k)],`` and we wish to compute epsilon for a given target delta. The example code would be: >>> max_order = 32 >>> orders = range(2, max_order + 1) >>> rdp = np.zeros_like(orders, dtype=float) >>> for q, sigma, steps in parameters: >>> rdp += privacy_analysis.compute_rdp(q, sigma, steps, orders) >>> epsilon, opt_order = privacy_analysis.get_privacy_spent(orders, rdp, delta) """ import math import numpy as np from scipy import special from typing import List, Tuple, Union ######################## # LOG-SPACE ARITHMETIC # ######################## def _log_add(logx: float, logy: float) -> float: r"""Adds two numbers in the log space. Args: logx: First term in log space. logy: Second term in log space. Returns: Sum of numbers in log space. 
""" a, b = min(logx, logy), max(logx, logy) if a == -np.inf: # adding 0 return b # Use exp(a) + exp(b) = (exp(a - b) + 1) * exp(b) return math.log1p(math.exp(a - b)) + b # log1p(x) = log(x + 1) def _log_sub(logx: float, logy: float) -> float: r"""Subtracts two numbers in the log space. Args: logx: First term in log space. Expected to be greater than the second term. logy: First term in log space. Expected to be less than the first term. Returns: Difference of numbers in log space. Raises: ValueError If the result is negative. """ if logx < logy: raise ValueError("The result of subtraction must be non-negative.") if logy == -np.inf: # subtracting 0 return logx if logx == logy: return -np.inf # 0 is represented as -np.inf in the log space. try: # Use exp(x) - exp(y) = (exp(x - y) - 1) * exp(y). return math.log(math.expm1(logx - logy)) + logy # expm1(x) = exp(x) - 1 except OverflowError: return logx def _compute_log_a_for_int_alpha(q: float, sigma: float, alpha: int) -> float: r"""Computes :math:`log(A_\alpha)` for integer ``alpha``. Notes: Note that :math:`A_\alpha` is real valued function of ``alpha`` and ``q``, and that 0 < ``q`` < 1. Refer to Section 3.3 of https://arxiv.org/pdf/1908.10530.pdf for details. Args: q: Sampling rate of SGM. sigma: The standard deviation of the additive Gaussian noise. alpha: The order at which RDP is computed. Returns: :math:`log(A_\alpha)` as defined in Section 3.3 of https://arxiv.org/pdf/1908.10530.pdf. """ # Initialize with 0 in the log space. log_a = -np.inf for i in range(alpha + 1): log_coef_i = ( math.log(special.binom(alpha, i)) + i * math.log(q) + (alpha - i) * math.log(1 - q) ) s = log_coef_i + (i * i - i) / (2 * (sigma ** 2)) log_a = _log_add(log_a, s) return float(log_a) def _compute_log_a_for_frac_alpha(q: float, sigma: float, alpha: float) -> float: r"""Computes :math:`log(A_\alpha)` for fractional ``alpha``. Notes: Note that :math:`A_\alpha` is real valued function of ``alpha`` and ``q``, and that 0 < ``q`` < 1. 
def _compute_log_a_for_frac_alpha(q: float, sigma: float, alpha: float) -> float:
    r"""Computes :math:`log(A_\alpha)` for fractional ``alpha``.

    Notes:
        Note that :math:`A_\alpha` is real valued function of ``alpha`` and
        ``q``, and that 0 < ``q`` < 1.

        Refer to Section 3.3 of https://arxiv.org/pdf/1908.10530.pdf
        for details.

    Args:
        q: Sampling rate of SGM.
        sigma: The standard deviation of the additive Gaussian noise.
        alpha: The order at which RDP is computed.

    Returns:
        :math:`log(A_\alpha)` as defined in Section 3.3 of
        https://arxiv.org/pdf/1908.10530.pdf.
    """
    # The two parts of A_alpha, integrals over (-inf,z0] and [z0, +inf), are
    # initialized to 0 in the log space:
    log_a0, log_a1 = -np.inf, -np.inf
    i = 0

    # z0 is the split point between the two integration domains
    z0 = sigma ** 2 * math.log(1 / q - 1) + 0.5

    while True:  # do ... until loop
        coef = special.binom(alpha, i)
        log_coef = math.log(abs(coef))
        j = alpha - i

        log_t0 = log_coef + i * math.log(q) + j * math.log(1 - q)
        log_t1 = log_coef + j * math.log(q) + i * math.log(1 - q)

        log_e0 = math.log(0.5) + _log_erfc((i - z0) / (math.sqrt(2) * sigma))
        log_e1 = math.log(0.5) + _log_erfc((z0 - j) / (math.sqrt(2) * sigma))

        log_s0 = log_t0 + (i * i - i) / (2 * (sigma ** 2)) + log_e0
        log_s1 = log_t1 + (j * j - j) / (2 * (sigma ** 2)) + log_e1

        # Fractional binomial coefficients alternate in sign, so each term is
        # added or subtracted in log space accordingly
        if coef > 0:
            log_a0 = _log_add(log_a0, log_s0)
            log_a1 = _log_add(log_a1, log_s1)
        else:
            log_a0 = _log_sub(log_a0, log_s0)
            log_a1 = _log_sub(log_a1, log_s1)

        i += 1
        # stop once both terms are negligibly small
        if max(log_s0, log_s1) < -30:
            break

    return _log_add(log_a0, log_a1)
""" if float(alpha).is_integer(): return _compute_log_a_for_int_alpha(q, sigma, int(alpha)) else: return _compute_log_a_for_frac_alpha(q, sigma, alpha) def _log_erfc(x: float) -> float: r"""Computes :math:`log(erfc(x))` with high accuracy for large ``x``. Helper function used in computation of :math:`log(A_\alpha)` for a fractional alpha. Args: x: The input to the function Returns: :math:`log(erfc(x))` """ return math.log(2) + special.log_ndtr(-x * 2 ** 0.5) def _compute_rdp(q: float, sigma: float, alpha: float) -> float: r"""Computes RDP of the Sampled Gaussian Mechanism at order ``alpha``. Args: q: Sampling rate of SGM. sigma: The standard deviation of the additive Gaussian noise. alpha: The order at which RDP is computed. Returns: RDP at order ``alpha``; can be np.inf. """ if q == 0: return 0 # no privacy if sigma == 0: return np.inf if q == 1.0: return alpha / (2 * sigma ** 2) if np.isinf(alpha): return np.inf return _compute_log_a(q, sigma, alpha) / (alpha - 1) def compute_rdp( q: float, noise_multiplier: float, steps: int, orders: Union[List[float], float] ) -> Union[List[float], float]: r"""Computes Renyi Differential Privacy (RDP) guarantees of the Sampled Gaussian Mechanism (SGM) iterated ``steps`` times. Args: q: Sampling rate of SGM. noise_multiplier: The ratio of the standard deviation of the additive Gaussian noise to the L2-sensitivity of the function to which it is added. Note that this is same as the standard deviation of the additive Gaussian noise when the L2-sensitivity of the function is 1. steps: The number of iterations of the mechanism. orders: An array (or a scalar) of RDP orders. Returns: The RDP guarantees at all orders; can be ``np.inf``. 
""" if isinstance(orders, float): rdp = _compute_rdp(q, noise_multiplier, orders) else: rdp = np.array([_compute_rdp(q, noise_multiplier, order) for order in orders]) return rdp * steps def get_privacy_spent( orders: Union[List[float], float], rdp: Union[List[float], float], delta: float ) -> Tuple[float, float]: r"""Computes epsilon given a list of Renyi Differential Privacy (RDP) values at multiple RDP orders and target ``delta``. Args: orders: An array (or a scalar) of orders (alphas). rdp: A list (or a scalar) of RDP guarantees. delta: The target delta. Returns: Pair of epsilon and optimal order alpha. Raises: ValueError If the lengths of ``orders`` and ``rdp`` are not equal. """ orders_vec = np.atleast_1d(orders) rdp_vec = np.atleast_1d(rdp) if len(orders_vec) != len(rdp_vec): raise ValueError( f"Input lists must have the same length.\n" f"\torders_vec = {orders_vec}\n" f"\trdp_vec = {rdp_vec}\n" ) eps = rdp_vec - math.log(delta) / (orders_vec - 1) # special case when there is no privacy if np.isnan(eps).all(): return np.inf, np.nan idx_opt = np.nanargmin(eps) # Ignore NaNs return eps[idx_opt], orders_vec[idx_opt] ================================================ FILE: extensions/privacy/dp_kmeans.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import sys import numpy as np from scipy.special import gammainc from sklearn.cluster import KMeans from sklearn import cluster as skcluster kmeans_single = skcluster._kmeans.lloyd_iter_chunked_dense def sample(ndim, r, num_samples=1): x = np.random.normal(size=(num_samples, ndim)) ssq = np.sum(x**2,axis=1) fr = r*gammainc(ndim/2,ssq/2)**(1/ndim)/np.sqrt(ssq) if num_samples > 1: fr = np.tile(fr.reshape(num_samples,1),(1,ndim)) return np.multiply(x,fr) def sphere_packing_initialization(n_clusters, n_dim, min_cluster_radius, max_space_size, max_failed_cases, verbose=None): a, max_r = min_cluster_radius, max_space_size centers = np.empty((n_clusters, n_dim)) cluster_id = 0 fail_count = 0 r = max_r - a while cluster_id < n_clusters: v = sample(n_dim, r) if cluster_id > 0 and np.min(np.linalg.norm(centers[:cluster_id, :] - v, axis=-1)) < 2 * a: fail_count += 1 if fail_count >= max_failed_cases: fail_count = 0 cluster_id = 0 a = a / 2 # TODO Use binary search to find maximum a that don't fail (vaguely discribed in the diff-p kmeas paper) if verbose: print(f'Failing to pack, halving min_cluster_radius to {a}') r = max_r - a continue centers[cluster_id] = v cluster_id += 1 if verbose: print('Final min_cluster_radius', a) return centers, a def add_gaussian_noise(centers_new, weight_in_clusters, eps, max_cluster_l2, max_sample_weight, cluster_to_weight_ratio=-1, delta=1e-7, verbose=None): scaler = 1 if cluster_to_weight_ratio > 0: # Compute the scaler to apply to the sample weights scaler = max_cluster_l2 / (max_sample_weight * cluster_to_weight_ratio) max_sample_weight *= scaler max_l2_sensitivity = np.sqrt(max_cluster_l2 ** 2 + max_sample_weight ** 2) sigma = np.sqrt(2 * np.log(1.25 / delta)) * max_l2_sensitivity / eps if verbose: print('cluster_to_weight_ratio', cluster_to_weight_ratio, 'scaler', scaler, 'max_sample_weight', max_sample_weight, 'max_l2_sensitivity', max_l2_sensitivity, 'sigma', sigma) centers_sum = (centers_new * weight_in_clusters.reshape(-1, 1)) + 
np.random.normal(scale=sigma, size=centers_new.shape) # Scale the sample weights by scaling the cluster weights, since (s*w1 + s*w2, ...) == s*(w1 + w2 + ...), where s is the scaler # Add noise then rescale back. We should never get negative weights because of the noise weight_in_clusters[:] = np.maximum(1e-10, (weight_in_clusters * scaler) + np.random.normal(scale=sigma, size=weight_in_clusters.shape)) / scaler centers_new[:] = centers_sum / weight_in_clusters.reshape(-1, 1) def DPKMeans(n_dim, eps, max_cluster_l2, max_sample_weight=1.0, max_iter=300, cluster_to_weight_ratio=-1, n_clusters=8, tol=1e-4, verbose=0, delta=1e-7, max_failed_cases=300, min_cluster_radius=None, **kwargs): """Differentially private KMeans Initialise the differentially-private Sklearn.cluster.KMeans overriding lloyd algorithm, by adding Gaussian noise. Parameters --------- n_dim : int The dimension size of the input space eps : float The privacy loss (epsilon) per iteration. Currently only fix epsilon is implemented so the overall privacy loss <= eps * max_iter max_cluster_l2 : float The maximum l2 norm of any example vector that we want to cluster max_sample_weight : float The maximum weight of a sample default=1.0 max_iter : int, default=300 Maximum number of iterations of the k-means algorithm for a single run. cluster_to_weight_ratio : float, default=-1 The ratio max_cluster_l2 / max_sample_weight used to scale the cluster counts before adding the noise If it is set to -1, do not scale the counts n_clusters : int, default=8 The number of clusters to form as well as the number of centroids to generate. tol : float, default=1e-4 Relative tolerance with regards to Frobenius norm of the difference in the cluster centers of two consecutive iterations to declare convergence. verbose : int, default=0 Verbosity mode. 
delta : float, default=1e-7 Gaussian mechanism delta or probability of failure, should be set < 1/num of examples max_failed_cases : int, default=300 The number of sampling trails in sphere packing before halving the minimum cluster radius min_cluster_radius : float, default=None (= max_cluster_l2 / n_clusters) Half the minimum distance between clusters centers """ if min_cluster_radius is None: min_cluster_radius = max_cluster_l2 / n_clusters # Initalise the cluster centers using sphere packing init_centers, min_cluster_radius = sphere_packing_initialization(n_clusters, n_dim, min_cluster_radius, max_cluster_l2, max_failed_cases, verbose) final_eps = [0] # To keep track of the actual number of iterations until convergence def modified_lloyd(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads, update_centers=True): # Clip the maximum client contribution to the cluster count sample_weight = np.minimum(sample_weight, max_sample_weight) if not update_centers: return kmeans_single(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) # Scale input vectors if necessary if np.max(x_squared_norms) > max_cluster_l2 ** 2: if verbose: print(f'Scaling the input examples as their l2 norm is larger than {max_cluster_l2}') scaler_squared = np.minimum(max_cluster_l2 ** 2 / x_squared_norms, 1.0) x_squared_norms[:] = x_squared_norms * scaler_squared X[:] = X * np.sqrt(scaler_squared).reshape(-1, 1) kmeans_single(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads) # Add noise to centers_new add_gaussian_noise(centers_new, weight_in_clusters, eps, max_cluster_l2, max_sample_weight, cluster_to_weight_ratio, delta=delta, verbose=verbose) # Other values need to be changed because of that: center_shift, labels, center_shift[:] = np.linalg.norm(centers - centers_new, axis=-1) # Run E-step of kmeans 
to get the new labels kmeans_single(X, sample_weight, x_squared_norms, centers, centers_new, weight_in_clusters, labels, center_shift, n_threads, update_centers=False) # Increment the number of iterations final_eps[0] += eps sys.modules[KMeans.__module__].lloyd_iter_chunked_dense = modified_lloyd kmeans = KMeans(n_clusters=n_clusters, algorithm='full', init=init_centers, verbose=verbose, max_iter=max_iter, tol=tol, **kwargs) kmeans.eps = final_eps return kmeans def resetKMeans(): sys.modules[KMeans.__module__].lloyd_iter_chunked_dense = kmeans_single ================================================ FILE: extensions/privacy/metrics.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging import numpy as np import torch as T from copy import deepcopy from utils import make_optimizer, print_rank def extract_indices_from_embeddings(gradients, batch, embed_size, vocab_size): # Extract the Input gradient embeddings batch = T.cat([b.view(-1) for b in batch]).cpu().detach().numpy() embed_grad = gradients[:embed_size * vocab_size].reshape(vocab_size, embed_size) valid_batch = batch[batch > 0] tot_valid_tokens, tot_tokens = len(valid_batch), len(batch) # The embedding gradients of the indices seen in the batch have higher l2 norm, # because dl/dembed_i = dl/dembed_input_i * (if word_i is in batch) + dl/dembed_output_i extracted_indices = T.argsort(embed_grad.norm(dim=-1), descending=True)[:tot_tokens].cpu().detach().numpy() # Get the overlap ratio extracted_ratio = np.isin(valid_batch, extracted_indices).mean() # Find True positive extracted indices return extracted_ratio, np.intersect1d(extracted_indices, valid_batch) def compute_perplexity(encoded_batch, model): outputs = model.inference(encoded_batch) (batch_size, seq_len, vocab_size) = outputs['output'].shape perplex = T.nn.functional.log_softmax(outputs['output'], dim=-1) return perplex.reshape(-1, vocab_size)[np.arange(batch_size * 
seq_len), encoded_batch.reshape(-1)].reshape(batch_size, seq_len) def practical_epsilon_leakage(original_params, model, encoded_batches, is_weighted_leakage=True, max_ratio=1e9, optimizer_config=None): # Copy the gradients and save the model. current_params = deepcopy(model.state_dict()) current_gradients = dict((n,p.grad.clone().detach()) for n,p in model.named_parameters()) model.load_state_dict(original_params) pre_perplex, post_perplex = [], [] # This is just to initialise the gradients model.loss(encoded_batches[0][:1]).backward() model.zero_grad() tolerance = 1 / max_ratio max_leakage = 0 with T.no_grad(): # Original model before training on client for encoded_batch in encoded_batches: pre_perplex.append(compute_perplexity(encoded_batch, model)) # The attacker doesn't not he optimal gradient magnitude but using Adamax with high lr, is proved to be effective for n, p in model.named_parameters(): p.grad = current_gradients[n] #.grad print_rank('grad l2: {}'.format(p.grad), loglevel=logging.DEBUG) if optimizer_config is None: optimizer_config = {'lr': 0.03, 'amsgrad': False, 'type': 'adamax'} #T.optim.Adamax(model.parameters(), lr=optim_lr).step() make_optimizer(optimizer_config, model).step() #model.zero_grad() # The model after training on the client data for encoded_batch in encoded_batches: post_perplex.append(compute_perplexity(encoded_batch, model)) for pre, post in zip(pre_perplex, post_perplex): # Compute the ratio of preplexity and weight it be the probability of correctly predicting the word leakage = ((pre + tolerance) / (post + tolerance)).clamp_(0, max_ratio) print_rank('perplexities leakage: {} '.format(leakage), loglevel=logging.DEBUG) if is_weighted_leakage: weight_leakage = T.max(pre.exp(), post.exp()) * leakage else: weight_leakage = leakage max_leakage = max(max_leakage, weight_leakage.max().item()) print_rank('raw max leakage: {}'.format(max_leakage), loglevel=logging.DEBUG) model.load_state_dict(current_params) for n,p in 
model.named_parameters(): p.grad = current_gradients[n] # WE return the log to match epsilon return max(np.log(max_leakage), 0) ================================================ FILE: extensions/quantization/quant.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import logging import torch from utils import print_rank from typing import Optional, Tuple def quant_model( model: torch.nn.Module, quant_bits: int = 8, quant_threshold: Optional[int] = None, global_stats: bool = False ): '''Quantize the gradients using the desired number of bits. Nothing is returned as gradients inside :code:`model` are modified in-place. Args: model: model which gradients we want to quantize. quant_bits: how many bits will we use to quantize the gradients. quant_threshold: fraction of components to be set to zero; defaults to None, in which case no quantization happens. global_stats: use a single histogram for all layers when binning, defaults to False. ''' # If no `quant_threshold`, does nothing if quant_threshold is None: return print_rank('Performing Gradient Quantization with Prob. 
Threshold: {}'.format( quant_threshold), loglevel=logging.INFO) # If `global_stats` is true, min/max and thresh are computed across all layers if global_stats: flattened_grad = torch.cat([p.grad.data.flatten() for p in model.parameters()]) min_grad, max_grad, thresh = find_min_max_gradient(flattened_grad, quant_threshold) # Loop through all layers for p in model.parameters(): if not global_stats: min_grad, max_grad, thresh = find_min_max_gradient(p.grad.data, quant_threshold) # Perform binning and sparsification of components binned_grad = quant_bins(p.grad.data, 2 ** quant_bits, min_grad, max_grad) p.grad = torch.where(torch.abs(p.grad.data) > thresh, binned_grad, torch.tensor(0.).to(p.grad)) def find_min_max_gradient( gradient: torch.Tensor, quant_threshold: Optional[float] = None ) -> Tuple[float, float, float]: '''Get min and max gradients, as well as threshold gradient. Args: gradient: tensor over which statistics will be computed. quant_threshold: which quantile to look for to compute threshold, must be between 0 and 1. ''' # Computes min/max and quantile corresponding to `quant_threshold` min_grad, max_grad = gradient.min(), gradient.max() thresh = torch.quantile(torch.abs(gradient), quant_threshold) print_rank('Min. and Max. Gradients: {}, {}'.format(min_grad, max_grad), loglevel=logging.INFO) print_rank('Grad. Threshold: {}'.format(thresh), loglevel=logging.INFO) return min_grad, max_grad, thresh def quant_bins( gradients: torch.Tensor, n_bins: int, min_grad: float, max_grad: float ) -> torch.Tensor: '''Perform quantization using binning. Creates histogram with `n_bins` bins between `min_grad` and `max_grad`. Returns a tensor similar to gradients but with components corresponding to bin labels. Args: gradients: tensor we want to quantize. n_bins: how many bins to use for binning. min_grad: min. value for bins. max_grad: max. value for bins. 
''' # We remove half bin width, as bucketize always takes the ceil instead of rounding bin_labels = torch.linspace(min_grad, max_grad, n_bins).to(gradients) bin_width = bin_labels[1] - bin_labels[0] grad_bins = torch.bucketize(gradients - .5 * bin_width, bin_labels, right=False) return bin_labels[grad_bins] ================================================ FILE: requirements.txt ================================================ torch==1.11.0 mpi4py easydict scipy psutil transformers torchvision pandas h5py sphinx_rtd_theme azureml-core azureml-defaults pyyaml scikit-learn cerberus protobuf sentencepiece googledrivedownloader wget ================================================ FILE: testing/README.md ================================================ ## Information The tests are designed to evaluate the operation of the tasks, not the performance. Therefore, we are using dummy data to run all tasks. In order to have ralistic results about the behaviour of each experiment, please follow the instructions provided in the README.md file inside each experiment folder, for downloading the recommended datasets. ## Setup Instructions for Pytest 1. Run create_data.py in order to download and preprocess the dummy training and testing datasets that will be used. Make sure to indicate the task name. The example below shows how to create the data for the ```nlg_gru``` task. ``` python python create_data.py --task nlg_gru ``` 2. The script ```test_e2e_trainer.py``` is designed to run the test over all tasks, therefore you need to run Step 1 for each experiment first). 3. Run ```pytest -v -s``` to perfor the local test. 
================================================ FILE: testing/build_vocab.py ================================================ """Builds vocabulary file from data.""" import argparse import collections import json import os def build_counter(train_data, initial_counter=None): train_tokens = [] for u in train_data: for c in train_data[u]['x']: train_tokens.extend([s for s in c]) all_tokens = [] for i in train_tokens: all_tokens.extend(i) train_tokens = [] if initial_counter is None: counter = collections.Counter() else: counter = initial_counter counter.update(all_tokens) all_tokens = [] return counter def build_vocab(counter, vocab_size=10000): pad_symbol, unk_symbol = 0, 1 count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) count_pairs = count_pairs[:(vocab_size - 2)] # -2 to account for the unknown and pad symbols words, _ = list(zip(*count_pairs)) vocab = {} vocab[''] = pad_symbol vocab[''] = unk_symbol for i, w in enumerate(words): if w != '': vocab[w] = i + 1 return {'vocab': vocab, 'size': vocab_size, 'unk_symbol': unk_symbol, 'pad_symbol': pad_symbol} def load_leaf_data(file_path): with open(file_path) as json_file: data = json.load(json_file) to_ret = data['user_data'] data = None return to_ret def save_vocab(vocab, target_dir): os.makedirs(target_dir, exist_ok=True) with open('./models/vocab_reddit.vocab', 'w') as outV: outV.write('\n') for t in vocab['vocab'].keys(): outV.write(t+'\n') def main(): args = parse_args() json_files = [f for f in os.listdir(args.data_dir) if f.endswith('.json')] json_files.sort() counter = None train_data = {} for f in json_files: print('loading {}'.format(f)) train_data = load_leaf_data(os.path.join(args.data_dir, f)) print('counting {}'.format(f)) counter = build_counter(train_data, initial_counter=counter) print() train_data = {} if counter is not None: vocab = build_vocab(counter, vocab_size=args.vocab_size) save_vocab(vocab, args.target_dir) else: print('No files to process.') def parse_args(): parser = 
argparse.ArgumentParser() parser.add_argument('--data-dir', help='dir with training file;', type=str, required=True) parser.add_argument('--vocab-size', help='size of the vocabulary;', type=int, default=10000, required=False) parser.add_argument('--target-dir', help='dir with training file;', type=str, default='./', required=False) return parser.parse_args() if __name__ == '__main__': main() ================================================ FILE: testing/create_data.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import os import csv import json import random import argparse import platform from collections import OrderedDict from itertools import islice import tqdm import h5py import torchvision import torchvision.transforms as transforms from google_drive_downloader import GoogleDriveDownloader as gdd def get_arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() parser.add_argument("--task") return parser def reduce_users(file): with open(file, 'r') as f: json_file = json.load(f) num_samples = json_file['num_samples'][0:25] user_data = dict(OrderedDict(islice(json_file['user_data'].items(), 0, 25))) users_list = list(user_data.keys()) return users_list, num_samples, user_data def _process_and_save_to_disk(dataset, n_users, exp, output): '''Process a Torchvision dataset to expected format and save to disk''' # Split training data equally among all users total_samples = len(dataset) samples_per_user = total_samples // n_users assert total_samples % n_users == 0 # Function for getting a given user's data indices user_idxs = lambda user_id: slice(user_id * samples_per_user, (user_id + 1) * samples_per_user) data_dict = { # the data is expected to have this format 'users' : [f'{user_id:04d}' for user_id in range(n_users)], 'num_samples' : n_users * [samples_per_user], 'user_data' : {f'{user_id:04d}': dataset.data[user_idxs(user_id)].tolist() if exp =="classif_cnn" else 
dataset.data[user_idxs(user_id)] for user_id in range(n_users)}, 'user_data_label': {f'{user_id:04d}': dataset.targets[user_idxs(user_id)] for user_id in range(n_users)}, } with h5py.File(output + '.hdf5', 'w') as hdf5_file: _dump_dict_to_hdf5(data_dict=data_dict, hdf5_file=hdf5_file) def _dump_dict_to_hdf5(data_dict: dict, hdf5_file: h5py.File): '''Dump dict with expected structure to HDF5 file''' hdf5_file.create_dataset('users', data=data_dict['users']) hdf5_file.create_dataset('num_samples', data=data_dict['num_samples']) # Store actual data in groups user_data_group = hdf5_file.create_group('user_data') for user, user_data in tqdm.tqdm(data_dict['user_data'].items()): user_subgroup = user_data_group.create_group(user) user_subgroup.create_dataset('x', data=user_data) user_data_label_group = hdf5_file.create_group('user_data_label') for user, user_data_label in tqdm.tqdm(data_dict['user_data_label'].items()): user_data_label_group.create_dataset(user, data=user_data_label) class HeartDataSet: def __init__(self, heartdata, cutoff): self.data = [row[:187] for row in heartdata][:cutoff] self.targets = [int(float(row[187])) for row in heartdata][:(round(len(heartdata), -3))][:cutoff] def __len__(self): return len(self.data) def main(): parser = get_arg_parser() args = parser.parse_args() args = vars(args) exp = args["task"] # Create data folder os.system("mkdir data") if exp == "nlg_gru" or exp == "mlm_bert": # Download preprocessed reddit dataset by LEAF: A Benchmark for Federated Settings gdd.download_file_from_google_drive(file_id='1ISzp69JmaIJqBpQCX-JJ8-kVyUns8M7o', dest_path='./data/nlg_gru.zip', unzip=True) files = ["train_data", "val_data", "test_data"] for file in files: orig_file = os.path.join("data","new_small_data",str(file+".json")) users_list, num_samples, user_data = reduce_users(orig_file) # Preprocess data if exp == "nlg_gru": os.makedirs("data/nlg_gru", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/nlg_gru") for 
users in user_data: listToStr = '' for i, sentences in enumerate(user_data[users]['x']): for j, pieces in enumerate(sentences): listToStr = ' '.join([elem for elem in pieces]) user_data[users]['x'][i][j] = listToStr full_sentence = ' '.join([elem for elem in sentences]) full_sentence = full_sentence.replace('', '').replace('', '').replace('', '').strip() user_data[users]['x'][i] = full_sentence user_data[users].pop('y',None) elif exp == "mlm_bert": os.makedirs("data/mlm_bert", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/mlm_bert") user_data_aux = dict() for users in user_data: listToStr = '' for i, sentences in enumerate(user_data[users]['x']): for j, pieces in enumerate(sentences): listToStr = ' '.join([elem for elem in pieces]) listToStr = listToStr.replace('', '').replace('', '').replace('', '').strip() user_data[users]['x'][i][j] = listToStr user_data[users].pop('y',None) user_data_aux[users] = user_data[users]['x'] user_data = user_data_aux # Create new dictionary new_dict = {'users':users_list ,'num_samples':num_samples, 'user_data':user_data} # Save preprocessed files ext = ".json" if exp=="nlg_gru" else ".txt" new_file = os.path.join("data",exp,str(file+ ext)) f = open(new_file,'w') json.dump(new_dict,f) f.close() # Build vocabulary os.system(str("python build_vocab.py --data-dir ./data/"+ exp +" --target-dir ./models")) elif exp == "classif_cnn": os.makedirs("data/classif_cnn", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/classif_cnn") # Get training and testing sets from torchvision transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True,download=True, transform=transform) testset = torchvision.datasets.CIFAR10(root='./data', train=False,download=True, transform=transform) # Saving datasets _process_and_save_to_disk(trainset, n_users=50, exp=exp, 
output='./data/classif_cnn/train_data') _process_and_save_to_disk(testset, n_users=50, exp=exp, output='./data/classif_cnn/test_data') elif exp == "ecg_cnn": os.makedirs("data/ecg_cnn", exist_ok= True) if platform.system() == "Windows" else os.system("mkdir data/ecg_cnn") # Create dummy datasets for set in ['train_data.csv', 'test_data.csv']: data= [random.random() for i in range(188)] with open(os.path.join('data',exp,set), 'w', newline='') as f: write = csv.writer(f) for row in range(87554): write.writerow(data) # Preprocess datasets for set in ['train_data', 'test_data']: with open(os.path.join('data',exp,str(set+".csv"))) as f: testset = list(csv.reader(f , delimiter=',')) TestDataset = HeartDataSet(testset, 21000) _process_and_save_to_disk(TestDataset,1000,exp,os.path.join('data',exp,set)) if __name__ == '__main__': main() ================================================ FILE: testing/hello_world_classif_cnn.yaml ================================================ # Basic configuration file for running classif_cnn example using hdf5 files. # Parameters needed to initialize the model model_config: model_type: CNN # class w/ `loss` and `inference` methods model_folder: experiments/classif_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. 
DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1 # how many iterations between metric eval on val set rec_freq: 5 # how many iterations between metric eval on test set initial_val: true initial_rec: true max_iteration: 3 # how many iterations in total num_clients_per_iteration: 3 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: null test: batch_size: 10000 test_data: null type: model_optimization aggregate_median: softmax # how aggregations weights are computed initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: f1_score fall_back_to_best_model: false softmax_beta: 1.0 # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 4 list_of_train_data: null desired_max_samples: 50000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.9 type: optimization ================================================ FILE: testing/hello_world_ecg_cnn.yaml ================================================ # Basic configuration file for running ecg_cnn example using json files. 
# Parameters needed to initialize the model model_config: model_type: SuperNet # class w/ `loss` and `inference` methods model_folder: experiments/ecg_cnn/model.py # file containing class # Configuration for differential privacy dp_config: enable_local_dp: false # whether to enable user-level DP # Additional privacy metrics privacy_metrics_config: apply_metrics: false # cache data to compute additional metrics # Select the Federated optimizer to use (e.g. DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # whether to use RL-based meta-optimizers resume_from_checkpoint: false # restart from checkpoint if file exists do_profiling: false # run profiler and compute runtime metrics optimizer_config: # this is the optimizer used to update the model type: sgd lr: 1.0 annealing_config: # annealer for the learning rate type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 5 # how many iterations between metric eval on val set rec_freq: 3 # how many iterations between metric eval on test set initial_val: false initial_rec: false max_iteration: 3 # how many iterations in total num_clients_per_iteration: 3 # how many clients per iteration data_config: # where to get val and test data from val: batch_size: 10000 val_data: data/ecg_cnn/test_data.hdf5 test: batch_size: 10000 test_data: data/ecg_cnn/test_data.hdf5 type: model_optimization aggregate_median: softmax # how aggregations weights are computed softmax_beta: 20.0 initial_lr_client: 0.001 # learning rate used on client optimizer lr_decay_factor: 1.0 weight_train_loss: train_loss best_model_criterion: loss fall_back_to_best_model: false # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. 
client_config: do_profiling: false # run profiling and compute runtime metrics ignore_subtask: false data_config: # where to get training data from train: batch_size: 96 list_of_train_data: data/ecg_cnn/train_data.hdf5 desired_max_samples: 87000 optimizer_config: # this is the optimizer used by the client type: sgd lr: 0.001 # this is overridden by `initial_lr_client` momentum: 0.90 type: optimization ================================================ FILE: testing/hello_world_mlm_bert.yaml ================================================ # Basic configuration file for running mlm_bert example using json files. # Parameters needed to initialize the model model_config: model_type: BERT model_folder: experiments/mlm_bert/model.py BERT: loader_type: text model: model_name: roberta-large cache_dir: ./cache_dir use_fast_tokenizer: False mask_token: task: mlm past_index: -1 prediction_loss_only: false process_line_by_line: false training: seed: 12345 label_smoothing_factor: 0 batch_size: 64 max_seq_length: 256 # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of parameters is needed. enable_global_dp: false # Local dp clips and adds noise on the client and centrally accumulates the privacy budget eps: 100 # epsilon global_sigma: 0.35 # Used when global dp es enabled, specifies the global Gaussian noise weight_scaler: 0.0001 # indicates how the aggregation weights scaled before noise addition, and unscaled afterwards. max_grad: 0.008 # max gradient max_weight: 0.5 # The max_weight and min_weight should be already scaled by weight_scaler min_weight: 0.0000001 # Because we scale down the weight using weight_scalar -> clip -> add noise -> scale back up. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. # Select the Federated optimizer to use (e.g. 
DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. wantRL: false # Enable/Disable Reinforcement learning optimizer_config: # Configuration for server-side optimizer lr: 0.00001 weight_decay: 0.01 type: adamW annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 1000 val_freq: 5 # Frequency for validation rounds rec_freq: 5 # Frequency for testing rounds initial_val : false # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 2 # Total number of rounds for FL num_clients_per_iteration: 2 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data val_data: data/mlm_bert/val_data.txt task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 min_words_per_utt: 5 max_samples_per_user: 5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir # Note this is NOT the main training data configuration, which is configured in the # client config. This section is ignored unless you are running replay data. # If you want to run replay data- set a path name for train_data_server. 
# train: # loader_type: text # train_data: null # train_data_server: null # desired_max_samples: null test: # Test data configuration test_data: data/mlm_bert/test_data.txt task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 128 max_seq_length: 256 max_samples_per_user: 5000 mask_token: num_workers: 0 prepend_datapath: false cache_dir: ./cache_dir type: model_optimization # Server type aggregate_median: softmax # FL aggregation method weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 1.00 initial_lr_client: 0.00001 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration list_of_train_data: data/mlm_bert/train_data.txt task: mlm mlm_probability: 0.25 tokenizer_type_fast: False batch_size: 24 max_seq_length: 256 min_words_per_utt: 5 desired_max_samples: 5000 mask_token: num_workers: 0 num_frames: 0 max_grad_norm: 15.0 prepend_datapath: false cache_dir: ./cache_dir pin_memory: true type: optimization meta_optimizer_config: lr: 0.01 type: adam optimizer_config: type: adamW weight_decay: 0.01 amsgrad: true annealing_config: type: step_lr step_interval: epoch step_size: 2 gamma: 1.0 ================================================ FILE: testing/hello_world_nlg_gru.yaml ================================================ # Basic configuration file for running nlg_gru example using json files. 
# Parameters needed to initialize the model model_config: model_type: GRU model_folder: experiments/nlg_gru/model.py embed_dim: 160 vocab_size: 10000 hidden_dim: 512 OOV_correct: false # Configuration for differential privacy dp_config: enable_local_dp: false # If enabled, the rest of parameters is needed. # Additional privacy metrics privacy_metrics_config: apply_metrics: false # If enabled, the rest of parameters is needed. # Select the Federated optimizer to use (e.g. DGA or FedAvg) strategy: DGA # Determines all the server-side settings for training and evaluation rounds server_config: wantRL: false # Enable/Disable Reinforcement learning resume_from_checkpoint: true # Resumes from latest checkpoint iteration if available do_profiling: false # Capture profiling information during server updates. optimizer_config: # Configuration for server-side optimizer type: adam lr: 0.003 amsgrad: true annealing_config: # This section configures how the learning rate decays type: step_lr step_interval: epoch gamma: 1.0 step_size: 100 val_freq: 1 # Frequency for validation rounds rec_freq: 5 # Frequency for testing rounds initial_val : true # Enable initial validation round at itr=0 initial_rec: false # Enable initial testing round at itr=0 max_iteration: 3 # Total number of rounds for FL num_clients_per_iteration: 10 # Number of clients sampled per round data_config: # Server-side data configuration val: # Validation data # batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false val_data: data/nlg_gru/val_data.json vocab_dict: models/vocab_reddit.vocab pin_memory: true num_workers: 0 # Indicates how many workers are used for creating batches num_frames: 2400 max_batch_size: 2048 max_num_words: 25 unsorted_batch: true test: # Test data configuration batch_size: 2048 tokenizer_type: not_applicable prepend_datapath: false train_data: null train_data_server: null test_data: data/nlg_gru/test_data.json vocab_dict: models/vocab_reddit.vocab pin_memory: true 
num_workers: 0 # Indicates how many workers are used for creating batches max_batch_size: 2048 max_num_words: 25 unsorted_batch: true type: model_optimization aggregate_median: softmax # FL aggregation method weight_train_loss: train_loss # Determines how each client's weight is computed (e.g. grad_mean_loss, train_loss) softmax_beta: 20.0 initial_lr_client: 1.0 lr_decay_factor: 1.0 best_model_criterion: loss # Determine the best model based on minimal loss, for checkpointing fall_back_to_best_model: false # If a model degrades, use the previous best model # Dictates the learning parameters for client-side model updates. Train data is defined inside this config. client_config: meta_learning: basic stats_on_smooth_grad: true ignore_subtask: false num_skips_threshold: 10 copying_train_data: false do_profiling: false # Enables client-side training profiling data_config: train: # This is the main training data configuration batch_size: 64 tokenizer_type: not_applicable prepend_datapath: false list_of_train_data: data/nlg_gru/train_data.json vocab_dict: models/vocab_reddit.vocab pin_memory: true num_workers: 0 desired_max_samples: 50000 max_grad_norm: 20.0 max_batch_size: 128 max_num_words: 25 unsorted_batch: true type: optimization meta_optimizer_config: lr: 1.0 type: sgd optimizer_config: type: sgd annealing_config: type: step_lr step_interval: epoch step_size: 1 gamma: 1.0 ================================================ FILE: testing/test_e2e_trainer.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
import subprocess import os import platform import pytest xfail = pytest.mark.xfail def get_info(task): data_path=r'./testing/' output_path=r'./testing/outputs/' if task == 'nlg_gru': config_path=r'./testing/hello_world_nlg_gru.yaml' elif task == "classif_cnn": config_path=r'./testing/hello_world_classif_cnn.yaml' elif task == "ecg_cnn": config_path=r'./testing/hello_world_ecg_cnn.yaml' elif task == "mlm_bert": config_path=r'./testing/hello_world_mlm_bert.yaml' return data_path, output_path, config_path def run_pipeline(data_path, output_path, config_path, task): print("Testing {} task".format(task)) # Adjust command to the task and OS sym = "&" if platform.system() == "Windows" else ";" command = 'cd .. '+ sym +' python '+'-m '+'torch.distributed.run '+ '--nproc_per_node=2 '+ 'e2e_trainer.py '+ \ '-dataPath '+ data_path+' -outputPath '+output_path+' -config ' +config_path +\ ' -task '+ task + ' -backend '+ 'nccl' # Execute e2e_trainer + stores the exit code with open('logs.txt','w') as f: process= subprocess.run(command, shell=True,stdout=f,text=True,timeout=900) return_code=process.returncode # Print logs os.system("ls") os.system("less logs.txt") print(process.stderr) print("Finished running {} task".format(task)) return return_code def test_nlg_gru(): task = 'nlg_gru' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 def test_ecg_cnn(): task = 'ecg_cnn' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 @pytest.mark.xfail def test_mlm_bert(): task = 'mlm_bert' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 print("PASSED") @pytest.mark.xfail def test_classif_cnn(): task = 'classif_cnn' data_path, output_path, config_path = get_info(task) assert run_pipeline(data_path, output_path, config_path, task)==0 print("PASSED") 
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import random
import logging
from importlib.machinery import SourceFileLoader

from torch.utils.data import sampler

# NOTE(review): the project-local helpers ``utils.AverageMeter`` and
# ``utils.print_rank`` are imported lazily at their use sites so this module
# can be imported standalone and does not create a utils <-> data_utils cycle.


class BatchSampler(sampler.Sampler):
    """Batch sampler yielding contiguous ranges of dataset indices.

    We want to sample batches randomly, but each batch should have samples
    that are close to each other in the dataset (so that we don't have a lot
    of zero padding).

    Args:
        dataset: sized dataset the indices refer to.
        batch_size (int): nominal number of samples per batch.
        randomize (bool): shuffle the batch order on every iteration.
        drop_last (bool): drop, instead of trimming, an incomplete last batch.
    """

    def __init__(self, dataset, batch_size, randomize=True, drop_last=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.randomize = randomize
        batches = [range(begin_id, begin_id + batch_size)
                   for begin_id in range(0, len(dataset), batch_size)]
        # Valid indices are 0 .. len(dataset)-1, so the last batch must be
        # trimmed (or dropped) whenever its final index reaches len(dataset).
        # BUG FIX: the original test used `>`, letting a batch whose last index
        # was exactly len(dataset) through and indexing out of range; it also
        # raised IndexError on an empty dataset (batches == []).
        if batches and batches[-1][-1] >= len(dataset):
            if drop_last:
                del batches[-1]
            else:
                batches[-1] = range(batches[-1][0], len(dataset))
        self.batches = batches

    def __iter__(self):
        if self.randomize:
            random.shuffle(self.batches)
        return iter(self.batches)

    def __len__(self):
        # NOTE(review): returns number of *samples* (batches * batch_size),
        # not the number of batches; preserved because callers may depend on it.
        return len(self.batches) * self.batch_size


class DynamicBatchSampler(sampler.Sampler):
    """Extension of Sampler that will do the following:
        1.  Change the batch size (essentially number of sequences)
            in a batch to ensure that the total number of frames are less
            than a certain threshold.
        2.  Make sure the padding efficiency in the batch is high.
    """

    def __init__(self, sampler, frames_threshold, max_batch_size=0, unsorted_batch=False, fps=1000 / 30):
        """
        @sampler: will mostly be an instance of DistributedSampler.
        Though it should work with any sampler.
        @frames_threshold: maximum area of the batch
        """
        from utils import AverageMeter  # lazy: project-local helper

        self.sampler = sampler
        self.frames_threshold = frames_threshold
        self.max_batch_size = max_batch_size
        self.unsorted_batch = unsorted_batch

        indices, batches = list(), list()
        # the dataset to which these indices are pointing to
        dataset = self.sampler.dataset

        # get all the indices and corresponding durations from the sampler
        for idx in self.sampler:
            indices.append((idx, dataset.utt_list[idx]["duration"]))

        # sort the indices according to duration
        if self.unsorted_batch is False:
            indices.sort(key=lambda elem: elem[1])
            max_dur = indices[-1][1]
        else:
            # make sure that you will be able to serve all the utterances
            max_dur = max([indices[i][1] for i in range(len(indices))])

        # start clubbing the utterances together
        batch = list()
        batch_frames, batch_area = 0, 0
        max_frames_in_batch = 0
        average_meter = AverageMeter('Padding Efficiency')
        for idx, duration in indices:
            if duration > 0:
                frames = duration * fps
                if frames > max_frames_in_batch:
                    max_frames_in_batch = frames

                if (self.unsorted_batch and len(batch) < max_batch_size) \
                        or (not self.unsorted_batch
                            and batch_frames + frames <= self.frames_threshold
                            and (max_batch_size == 0 or len(batch) < max_batch_size)):
                    batch.append(idx)
                    batch_frames += frames
                    batch_area = max_frames_in_batch * len(batch)
                else:
                    # log the stats and add previous batch to batches
                    if batch_area > 0 and len(batch) > 0:
                        average_meter.add(batch_frames, batch_area)
                        batches.append(batch)
                    # make a new one
                    # NOTE(review): the current ``idx`` is not appended to the
                    # fresh batch even though its frame counters are seeded with
                    # it -- the utterance appears to be dropped. Preserved as-is;
                    # confirm against upstream before changing.
                    batch = list()
                    batch_frames, batch_area = frames, frames
                    max_frames_in_batch = batch_frames

        # When all indices are processed
        if batch_area > 0 and len(batch) > 0:
            average_meter.add(batch_frames, batch_area)
            batches.append(batch)

        # don't need the 'indices' any more
        del indices
        self.batches = batches
        average_meter.display_results(loglevel=logging.DEBUG)

    def __iter__(self):
        # shuffle on a batch level
        random.shuffle(self.batches)
        return iter(self.batches)

    def __len__(self):
        return len(self.batches)


def get_exp_dataloader(task):
    """Return the DataLoader class declared in the experiment folder.

    Args:
        task (str): task parsed from the console.

    Raises:
        Exception: re-raised when ``experiments/<task>/dataloaders/dataloader.py``
            cannot be loaded. The original swallowed the error with a bare
            ``except`` and then crashed with ``UnboundLocalError`` on the return.
    """
    module_path = os.path.join('experiments', task, 'dataloaders', 'dataloader.py')
    try:
        module = SourceFileLoader("DataLoader", module_path).load_module()
        return module.DataLoader
    except Exception:
        from utils import print_rank  # lazy: project-local helper
        print_rank("Dataloader not found, please make sure is located inside the experiment folder")
        raise


def make_train_dataloader(data_config, data_path, clientx, task=None, vec_size=300, data_strct=None, replay_server=False):
    """Create a dataloader for training on either server or client side."""
    mode = 'train'
    tokenizer_type = data_config.get('tokenizer_type', 'not_applicable')

    # Training list for a server
    if clientx is None:
        if "train_data_server" not in data_config or data_config["train_data_server"] is None:
            from utils import print_rank  # lazy: project-local helper
            print_rank("No server training set is defined")
            return None
        my_data = os.path.join(data_path, data_config["train_data_server"])
        mode = 'val'  # Only for replay_server
        clientx = 0   # Only for replay_server
    # Training list on a client side
    else:
        if tokenizer_type != 'not_applicable':
            assert clientx >= 0 and clientx < len(data_config["train_data"]), "Invalid client index {}".format(clientx)
            my_data = data_config["train_data"][clientx]
        else:
            my_data = data_config["list_of_train_data"]

    DataLoader = get_exp_dataloader(task)
    train_dataloader = DataLoader(data=data_strct if data_strct is not None else my_data,
                                  user_idx=clientx,
                                  mode=mode,
                                  args=data_config)
    return train_dataloader


def make_val_dataloader(data_config, data_path, task=None, data_strct=None, train_mode=False):
    """Return a data loader for a validation set."""
    DataLoader = get_exp_dataloader(task)
    val_file = os.path.join(data_path, data_config["val_data"]) \
        if data_config["val_data"] is not None and data_path is not None else None
    val_dataloader = DataLoader(data=data_strct if data_strct is not None else val_file,
                                user_idx=0,
                                mode='val',
                                args=data_config)
    return val_dataloader


def make_test_dataloader(data_config, data_path, task=None, data_strct=None):
    """Return a data loader for an evaluation set."""
    DataLoader = get_exp_dataloader(task)
    test_file = os.path.join(data_path, data_config["test_data"]) \
        if data_config["test_data"] is not None and data_path is not None else None
    test_dataloader = DataLoader(data=data_strct if data_strct is not None else test_file,
                                 user_idx=0,
                                 mode='test',
                                 args=data_config)
    return test_dataloader


def get_dataset(data_path, config, task, mode, test_only=False, user_idx=-1, data_strct=None):
    """Return the task train/val/test dataset."""
    # Load Dataset Class declared in the experiment folder
    data_config = get_data_config(config, mode)
    dataset_path = os.path.join('experiments', task, 'dataloaders', 'dataset.py')
    loader = SourceFileLoader("Dataset", dataset_path).load_module()
    dataset = loader.Dataset

    data_file = "val_data" if mode == "val" else "test_data" if mode == "test" else "list_of_train_data"
    data_file = data_config[data_file]
    data_pointer = os.path.join(data_path, data_file) if data_file is not None else data_file

    return dataset(data_pointer if data_strct is None else data_strct,
                   test_only=test_only, user_idx=user_idx, args=data_config)


def get_data_config(config, mode):
    """Return the configuration for the dataset.

    ``val``/``test`` come from the server config; everything else falls back to
    the client-side ``train`` config, optionally merged with the client's
    ``semisupervision`` section (semisupervision keys win on conflict).
    """
    if mode == 'val':
        data_config = config['server_config']['data_config']["val"]
    elif mode == 'test':
        data_config = config['server_config']['data_config']["test"]
    else:
        data_config = config["client_config"]["data_config"]["train"]
        semisupervision_config = config["client_config"].get('semisupervision', None)
        if semisupervision_config is not None:
            return {**data_config, **semisupervision_config}
    return data_config
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math

import torch
from torch.optim import Optimizer


class AdamW(Optimizer):
    """ Implements Adam algorithm with weight decay fix.

    Weight decay is decoupled from the gradient-based update (applied directly
    to the weights after the Adam step), as opposed to L2 regularization.

    Parameters:
        lr (float): learning rate. Default 1e-3.
        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
        eps (float): Adams epsilon. Default: 1e-6
        weight_decay (float): Weight decay. Default: 0.0
        correct_bias (bool): can be set to False to avoid correcting bias in Adam
            (e.g. like in Bert TF repository). Default True.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
        super(AdamW, self).__init__(
            params,
            dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias),
        )

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = closure() if closure is not None else None

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            for param in group['params']:
                if param.grad is None:
                    continue
                grad = param.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[param]
                if not state:
                    # Lazily initialize step counter and first/second moments.
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(param.data)
                    state['exp_avg_sq'] = torch.zeros_like(param.data)

                first_moment = state['exp_avg']
                second_moment = state['exp_avg_sq']
                state['step'] += 1

                # Update biased running averages of the gradient and its square
                # (in place, so the state tensors are reused).
                first_moment.mul_(beta1).add_(grad, alpha=1.0 - beta1)
                second_moment.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)

                denom = second_moment.sqrt().add_(group['eps'])
                step_size = group['lr']
                if group['correct_bias']:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1 ** state['step']
                    bias_correction2 = 1.0 - beta2 ** state['step']
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

                param.data.addcdiv_(first_moment, denom, value=-step_size)

                # Decoupled weight decay: decaying the weights directly, rather
                # than adding the squared weights to the loss, keeps the decay
                # from interacting with the m/v statistics (equivalent to plain
                # SGD weight decay).
                if group['weight_decay'] > 0.0:
                    param.data.add_(param.data, alpha=-group['lr'] * group['weight_decay'])

        return loss
# Licensed under the MIT license. """Lamb optimizer.""" import collections import math import torch from torch.optim import Optimizer try: from tensorboardX import SummaryWriter def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): """Log a histogram of trust ratio scalars in across layers.""" results = collections.defaultdict(list) for group in optimizer.param_groups: for p in group['params']: state = optimizer.state[p] for i in ('weight_norm', 'adam_norm', 'trust_ratio'): if i in state: results[i].append(state[i]) for k, v in results.items(): event_writer.add_histogram(f'lamb/{k}', torch.tensor(v), token_count) except ImportError: def log_lamb_rs(optimizer, event_writer, token_count): print("tensorboardX is not installed") class LAMB(Optimizer): r"""Implements Lamb algorithm. It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) betas (Tuple[float, float], optional): coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999)) eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) weight_decay (float, optional): weight decay (L2 penalty) (default: 0) adam (bool, optional): always use trust ratio = 1, which turns this into Adam. Useful for comparison purposes. .. 
_Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: https://arxiv.org/abs/1904.00962 """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0, adam=False): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) self.adam = adam super(LAMB, self).__init__(params, defaults) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group['params']: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') state = self.state[p] # State initialization if len(state) == 0: state['step'] = 0 # Exponential moving average of gradient values state['exp_avg'] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] state['step'] += 1 # Decay the first and second moment running average coefficient # m_t exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) # v_t exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) # Paper v3 does not use debiasing. # bias_correction1 = 1 - beta1 ** state['step'] # bias_correction2 = 1 - beta2 ** state['step'] # Apply bias to lr to avoid broadcast. 
step_size = group['lr'] # * math.sqrt(bias_correction2) / bias_correction1 weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps']) if group['weight_decay'] != 0: adam_step.add_(p.data, alpha=group['weight_decay']) adam_norm = adam_step.pow(2).sum().sqrt() if weight_norm == 0 or adam_norm == 0: trust_ratio = 1 else: trust_ratio = weight_norm / adam_norm state['weight_norm'] = weight_norm state['adam_norm'] = adam_norm state['trust_ratio'] = trust_ratio if self.adam: trust_ratio = 1 p.data.add_(adam_step, alpha=-step_size * trust_ratio) return loss ================================================ FILE: utils/optimizers/lars.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. """distoptim.hit package""" import logging import torch LOG = logging.getLogger(__name__) class LarsSGDV1(torch.optim.SGD): """ LARS SGD V1, based on https://arxiv.org/abs/1708.03888 2018. Refer to torch.optim.SGD for paramters. """ def __init__(self, params, lr, momentum=0, dampening=0, weight_decay=0, nesterov=False): LOG.info("Init LarsSGDV1") super(LarsSGDV1, self).__init__( params, lr, momentum, dampening, weight_decay, nesterov) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] # dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data p_n = p.data.norm() d_p_n = d_p.norm() if weight_decay != 0: d_p_n.add_(weight_decay, p_n) d_p.add_(weight_decay, p.data) alpha = 0.001 * p_n / d_p_n # This is the LARS eta from the paper lr = alpha * group['lr'] lr = min(lr, 5.0) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = \ torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(lr, d_p) if nesterov: d_p = d_p.add(momentum, buf) else: d_p = buf p.data.add_(-1, d_p) return loss class LarsSGD(torch.optim.SGD): """ LARS SGD, based on https://arxiv.org/abs/1904.00962 Algorithm 1 2019, a newer version. Refer to torch.optim.SGD for paramters. """ def __init__(self, params, lr, momentum=0, dampening=0, weight_decay=0, nesterov=False): LOG.info("Init LarsSGD") super(LarsSGD, self).__init__( params, lr, momentum, dampening, weight_decay, nesterov) def step(self, closure=None): """Performs a single optimization step. Arguments: closure (callable, optional): A closure that reevaluates the model and returns the loss. 
""" loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] # dampening = group['dampening'] nesterov = group['nesterov'] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if weight_decay != 0: d_p.add(p.data, alpha=weight_decay) if momentum != 0: param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = \ torch.clone(d_p).detach() else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_(1 - momentum, d_p) if nesterov: d_p = d_p.add(buf, alpha=momentum) else: d_p = buf lr = group['lr'] * p.data.norm() / (d_p.norm() + 1e-8) lr.clamp_(0, 10) p.data.add_(d_p, alpha=-lr) return loss ================================================ FILE: utils/preprocessing/create-hdf5.py ================================================ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. import h5py import time from tqdm import tqdm import pandas as pd path = r'C:\Users\train.tsv' def local_time(): return str(time.strftime("%H:%M:%S",time.localtime())) print(local_time() + " Starting script " ) columns = ['author','num1','content','str1','str2','num2','subreddit'] df = pd.read_csv(path, sep='\t', names=columns, header=None) print(local_time() + " File has been read " ) df_authors = pd.DataFrame(df['author']) df_content = pd.DataFrame(df['content']) df_file = pd.concat([df_authors,df_content], axis=1) print(local_time() + " Data needed has been concatenated ") users_group = df_file.groupby('author') group0 = df_file.groupby(['author','content']) group1 = pd.Series(users_group.size()) users = (group1.index).to_numpy() print(local_time() + " users been formatted ") num_samples = group1.values print(local_time() + " num_samples has been formatted ") user_data_dict= {} user_data_dict= {i: {'x':list()} for i in tqdm(users)} for i in tqdm(range(len(df_file))): if df_file['content'][i] not in 
# ---------------------------------------------------------------------------
# FILE: utils/preprocessing/create-hdf5.py
# ---------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Converts a reddit-style TSV dump into the HDF5 layout consumed by FLUTE:
# top-level 'users' and 'num_samples' datasets plus a 'user_data' group with
# one deduplicated 'x' dataset per user.

import h5py
import time
from tqdm import tqdm
import pandas as pd

path = r'C:\Users\train.tsv'

def local_time():
    """Wall-clock time as HH:MM:SS, for lightweight progress logging."""
    return str(time.strftime("%H:%M:%S", time.localtime()))

print(local_time() + " Starting script ")
columns = ['author', 'num1', 'content', 'str1', 'str2', 'num2', 'subreddit']
df = pd.read_csv(path, sep='\t', names=columns, header=None)
print(local_time() + " File has been read ")

df_file = pd.concat([pd.DataFrame(df['author']), pd.DataFrame(df['content'])], axis=1)
print(local_time() + " Data needed has been concatenated ")

users_group = df_file.groupby('author')
group1 = pd.Series(users_group.size())
users = (group1.index).to_numpy()
print(local_time() + " users been formatted ")

num_samples = group1.values
print(local_time() + " num_samples has been formatted ")

# Collect each user's unique utterances, preserving first-seen order.
# PERF FIX: membership was previously tested against a growing list
# (O(n) per row, quadratic overall); a per-user set makes it O(1).
user_data_dict = {i: {'x': list()} for i in tqdm(users)}
seen = {i: set() for i in users}
for i in tqdm(range(len(df_file))):
    author = df_file['author'][i]
    content = df_file['content'][i]
    if content not in seen[author]:
        seen[author].add(content)
        user_data_dict[author]['x'].append(content)
print(local_time() + " user_data has been formatted ")

f = h5py.File(r"C:\Users\train.hdf5", "w")
dset_0 = f.create_dataset("num_samples", data=num_samples)
dset_1 = f.create_dataset("users", data=users)
print(local_time() + " starting to store dictionary ")
user_data = f.create_group("user_data")
for user in tqdm(user_data_dict):
    user_group = user_data.create_group(user)
    # h5py needs byte strings for variable-length text datasets.
    user_data_dict[user]['x'] = [str(e).encode('utf8') for e in user_data_dict[user]['x']]
    x_dset = user_group.create_dataset('x', data=user_data_dict[user]['x'])
print(local_time() + " end of script ")


# ---------------------------------------------------------------------------
# FILE: utils/preprocessing/create-json.py
# ---------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Same conversion as create-hdf5.py, but emits the FLUTE JSON layout
# ({users, num_samples, user_data}) instead of HDF5.

import json
import time
from tqdm import tqdm
import pandas as pd

path = r'C:\Users\train.tsv'

def local_time():
    """Wall-clock time as HH:MM:SS, for lightweight progress logging."""
    return str(time.strftime("%H:%M:%S", time.localtime()))

print(local_time() + " Starting script ")
columns = ['author', 'num1', 'content', 'str1', 'str2', 'num2', 'subreddit']
df = pd.read_csv(path, sep='\t', names=columns, header=None)
print(local_time() + " File has been read ")

df_file = pd.concat([pd.DataFrame(df['author']), pd.DataFrame(df['content'])], axis=1)
print(local_time() + " Data needed has been concatenated ")

users_group = df_file.groupby('author')
group1 = pd.Series(users_group.size())
users = (group1.index).to_numpy()
print(local_time() + " users been formatted ")

num_samples = group1.values
print(local_time() + " num_samples has been formatted ")

# Collect each user's unique utterances, preserving first-seen order
# (set-based dedupe instead of the original quadratic list scan).
user_data_dict = {i: {'x': list()} for i in tqdm(users)}
seen = {i: set() for i in users}
for i in tqdm(range(len(df_file))):
    author = df_file['author'][i]
    content = df_file['content'][i]
    if content not in seen[author]:
        seen[author].add(content)
        user_data_dict[author]['x'].append(content)

f = open(r'C:\Users\train.json', "w")
new_data = {'users': users.tolist(), 'num_samples': num_samples.tolist(), 'user_data': user_data_dict}
json.dump(new_data, f)
print(local_time() + " end of script ")


# ---------------------------------------------------------------------------
# FILE: utils/preprocessing/from_json_to_hdf5.py
# ---------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# Re-encodes a FLUTE JSON dataset into the equivalent HDF5 layout.

import json
import h5py
from tqdm import tqdm
import time

# NOTE(review): the path ends in .tsv although the file is parsed as JSON --
# looks like a copy-paste leftover; confirm the intended input path.
json_file = r'C:\Users\train.tsv'

def local_time():
    """Wall-clock time as HH:MM:SS, for lightweight progress logging."""
    return str(time.strftime("%H:%M:%S", time.localtime()))

print(local_time() + " Starting script ")
with open(json_file, 'r') as f:
    json_data = json.load(f)
print(local_time() + " JSON file read ")

hdf_file = h5py.File(r"C:\Users\train.hdf5", "w")
dset_0 = hdf_file.create_dataset("users", data=json_data['users'])
dset_1 = hdf_file.create_dataset("num_samples", data=json_data['num_samples'])
print(local_time() + " users and num_samples stored ")

user_data = hdf_file.create_group("user_data")
for user in tqdm(json_data['user_data']):
    user_group = user_data.create_group(user)
    dset_2 = user_group.create_dataset('x', data=json_data['user_data'][user]['x'])
print(local_time() + " end of script ")
"""Shared utilities for FLUTE: optimizer/LR-scheduler factories, logging helpers,
n-best JSON-list writers for ASR-style multi-task training, gradient flatten /
re-distribute helpers, and personalization / semi-supervision routines."""

import os
import sys
import numpy as np
import logging
import yaml
import time
import math
import json
import copy
import io
import pstats
import functools
import torch
from collections import OrderedDict
from utils.optimizers.lars import LarsSGD
from utils.optimizers.lamb import LAMB
from utils.optimizers.adamW import AdamW
from easydict import EasyDict as edict
from torch.optim.lr_scheduler import (
    StepLR,
    MultiStepLR,
    ReduceLROnPlateau
)


def make_optimizer(optimizer_config, model):
    """Initialization for optimizer.

    Builds the optimizer named by optimizer_config["type"] over model.parameters(),
    passing every other key of the config through as a keyword argument.
    Raises ValueError for an unknown type.
    """
    # Work on a copy so the caller's config dict is not mutated by the pop()s below.
    tmp_config = copy.deepcopy(optimizer_config)
    if optimizer_config["type"] == "sgd":
        tmp_config.pop("type", None)
        return torch.optim.SGD(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "adam":
        tmp_config.pop("type", None)
        return torch.optim.Adam(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "adamax":
        tmp_config.pop("type", None)
        # Adamax does not accept amsgrad; drop it if the config carries one.
        tmp_config.pop("amsgrad", None)
        return torch.optim.Adamax(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "lars":
        tmp_config.pop("type", None)
        # Imported lazily so torchlars is only required when this type is requested.
        from torchlars import LARS
        base_optimizer = torch.optim.SGD(model.parameters(), **tmp_config)
        return LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001)
    elif optimizer_config["type"] == "LarsSGD":
        tmp_config.pop("type", None)
        return LarsSGD(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "lamb":
        tmp_config.pop("type", None)
        return LAMB(model.parameters(), **tmp_config)
    elif optimizer_config["type"] == "adamW":
        tmp_config.pop("type", None)
        tmp_config.pop("amsgrad", None)
        return AdamW(model.parameters(), **tmp_config)
    else:
        raise ValueError("{} optimizer not supported".format(optimizer_config["type"]))


def get_lr(optimizer):
    """Obtain LR (of the optimizer's first param group only)."""
    for param_group in optimizer.param_groups:
        return param_group['lr']


def get_lr_all(optimizer):
    """Double checking for get_lr: yield the LR of every param group."""
    for param_group in optimizer.param_groups:
        yield param_group['lr']


def softmax(X, theta=1.0, axis=None):
    """Compute the softmax of each element along an axis of X.

    Args:
        X (ndarray): x, probably should be floats.
        theta (float): used as a multiplier prior to exponentiation. Default = 1.0
        axis : axis to compute values along. Default is the first non-singleton axis.

    Returns:
        An array the same size as X. The result will sum to 1 along the specified axis.
    """
    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)

    # multiply y against the theta parameter,
    y = y * float(theta)

    # subtract the max for numerical stability
    y = y - np.expand_dims(np.max(y, axis=axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis=axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # flatten if X was 1D
    if len(X.shape) == 1:
        p = p.flatten()

    return p


class AverageMeter(object):
    """
    Will calculate running micro and macro averages for various
    (error/efficiency) rates.
    """

    def __init__(self, metric_name):
        # Parallel lists: one (numerator, denominator) pair per add() call.
        self.numerators, self.denominators = list(), list()
        self.metric_name = metric_name

    def add(self, top, bottom):
        """Record one rate observation as numerator/denominator."""
        self.numerators.append(top)
        self.denominators.append(bottom)

    def get_macro_average(self):
        """Average of per-observation rates (each pair weighted equally)."""
        scores = [float(self.numerators[i]) / self.denominators[i] \
                  for i in range(len(self.denominators))]
        return self.get_average(scores)

    def get_micro_average(self):
        """Pooled rate: sum of numerators over sum of denominators."""
        return float(sum(self.numerators)) / sum(self.denominators)

    # accepts a list and returns average
    def get_average(self, l):
        return sum(l) / float(len(l))

    def reset(self):
        """Discard all recorded observations."""
        self.numerators, self.denominators = list(), list()

    def display_results(self, loglevel=logging.INFO):
        """Log both macro and micro averages at the given level."""
        print_rank("{} Macro average: {}".format(self.metric_name, self.get_macro_average()), loglevel)
        print_rank("{} Micro average: {}".format(self.metric_name, self.get_micro_average()), loglevel)


def make_lr_scheduler(annealing_config, optimizer, num_batches=1):
    """Set learning rate scheduler.

    Builds the scheduler named by annealing_config["type"]; when step_interval
    is "epoch", epoch-denominated step sizes/milestones are converted to
    iteration counts using num_batches (iterations per epoch).
    """
    # Copy so pop() does not mutate the caller's config.
    annealing_config = copy.deepcopy(annealing_config)
    annealing_type = annealing_config.pop("type")

    # per epoch or per iter
    step_interval = 'epoch'
    if "step_interval" in annealing_config:
        step_interval = annealing_config.pop("step_interval")

    if annealing_type == "step_lr":
        # convert epoch steps to iter steps
        # expochs can also be floats like 1.5
        if step_interval == "epoch":
            annealing_config["step_size"] = int(num_batches * \
                                                annealing_config["step_size"])
        lr_scheduler = StepLR(optimizer=optimizer, **annealing_config)
    elif annealing_type == "multi_step_lr":
        # convert epoch steps to iter steps
        if step_interval == "epoch":
            annealing_config["milestones"] = [int(i * num_batches) for i in annealing_config["milestones"]]
        lr_scheduler = MultiStepLR(optimizer=optimizer, **annealing_config)
    elif annealing_type == "rampup-keep-expdecay-keep":
        # emulate SpecAugment scheduling
        lr_scheduler = RampupKeepExpdecayKeepLRScheduler(optimizer=optimizer, **annealing_config)
    elif annealing_type == 'val_loss':
        lr_scheduler = ReduceLROnPlateau(optimizer, **annealing_config)
    else:
        raise ValueError("{} LR scheduler not supported".format(
            annealing_type))

    return lr_scheduler


class RampupKeepExpdecayKeepLRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Implements the LR schedule described in the specaugment paper.

    Piecewise schedule over step counts:
      [0, sr)  : linear ramp from 0 to peak_lr
      [sr, si) : hold peak_lr
      [si, sf) : exponential decay from peak_lr toward floor_lr
      [sf, ∞)  : hold floor_lr
    """

    def __init__(self, optimizer, peak_lr=0.001, floor_lr=0.00001, sr=1000, si=40000, sf=160000, last_epoch=-1):
        assert (peak_lr >= floor_lr)
        self.peak_lr = peak_lr
        self.floor_lr = floor_lr
        assert (sr <= si)
        assert (si <= sf)
        self.sr = sr
        self.si = si
        self.sf = sf
        # Decay rate chosen so lr(sf) == floor_lr exactly: gamma = ln(floor/peak)/(sf-si).
        self.gamma = math.log(self.floor_lr / self.peak_lr) / (float(self.sf - self.si))
        # NOTE(review): leftover debug prints — consider routing through print_rank.
        print('self.gamma')
        print(self.gamma)
        self.step_count = 0
        super(RampupKeepExpdecayKeepLRScheduler, self).__init__(optimizer, last_epoch=last_epoch)

    def step(self, epoch=None):
        # Overrides the base step(): writes this schedule's LR into every param
        # group directly. The epoch argument is accepted for API compatibility
        # but ignored; progress is tracked by self.step_count.
        for p, lr in zip(self.optimizer.param_groups, self.get_lr()):
            p['lr'] = lr
        self.step_count += 1

    def get_lr(self):
        lr = self.floor_lr
        if self.step_count < self.sr:
            # linear ramp up
            lr = self.peak_lr * float(self.step_count) / float(self.sr)
        elif self.step_count < self.si:
            # keep peak_lr
            lr = self.peak_lr
        elif self.step_count < self.sf:
            # exponential decay from peak_lr to floor_lr
            lr = self.peak_lr * math.exp(self.gamma * (float(self.step_count - self.si)))
        # Same lr for every param group (one entry per base lr).
        return [lr for base_lr in self.base_lrs]


class ScheduledSamplingScheduler():
    """ Implementing the schedule sampling rate schedule.

    0 - ramp_start = initial_rate
    ramp_start - ramp_end = {linearly increase to final_rate}
    ramp_end - infinity = final_rate
    """

    def __init__(self, model, ramp_start, ramp_stop, initial_rate, final_rate):
        self.model = model
        self.ramp_start = ramp_start
        self.ramp_stop = ramp_stop
        self.initial_rate = initial_rate
        self.final_rate = final_rate
        self.iter = 0

    def step(self):
        """Advance one iteration and write the current rate onto the model."""
        if self.iter < self.ramp_start:
            self.model.scheduled_sampling_rate = self.initial_rate
        elif self.iter >= self.ramp_start and self.iter <= self.ramp_stop:
            # Linear interpolation between initial and final rate across the ramp.
            self.model.scheduled_sampling_rate = self.initial_rate + (self.final_rate - self.initial_rate) * (
                (self.iter - self.ramp_start) / (self.ramp_stop - self.ramp_start))
        else:
            self.model.scheduled_sampling_rate = self.final_rate

        # Scheduled sampling is considered enabled whenever the rate is nonzero.
        self.model.scheduled_sampling = (self.model.scheduled_sampling_rate != 0)
        self.iter += 1

    def state_dict(self):
        # Exclude the model itself: only the scalar schedule state is checkpointed.
        return {key: value for key, value in self.__dict__.items() if key != 'model'}

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)


class NBestTaskScheduler():
    """ Implementing the scheduler for multi-task training.

    num_tasks[0]: 0 <= i < iteration_per_task[0]
    num_tasks[1]: iteration_per_task[0] <= i < iteration_per_task[1]
    """

    def __init__(self, num_tasks, iteration_per_task):
        assert len(num_tasks) == len(iteration_per_task), "Mismatched length {}!={}".format(len(num_tasks), len(iteration_per_task))
        self.iter = 0
        self.stagex = 0  # index of the current stage into num_tasks / iteration_per_task
        self.num_tasks = num_tasks
        self.iteration_per_task = iteration_per_task

    def current_num_tasks(self):
        """Number of tasks active in the current stage."""
        return self.num_tasks[self.stagex]

    def no_label_updates(self):
        """Return how many times transcription must be updated."""
        return (self.iter // self.iteration_per_task[-1]) + 1

    def set_iteration_no(self, iter_no):
        """Jump the scheduler to an absolute iteration number (e.g. on resume)."""
        self.iter = iter_no

    def step(self):
        """Advance one iteration, moving to the next stage at each boundary.

        The stage index wraps: iterations are taken modulo the last entry of
        iteration_per_task, so the whole stage sequence repeats each cycle.
        """
        print_rank("Iter={}: #tasks {} at stage {}".format(self.iter, self.current_num_tasks(), self.stagex))
        local_iter = self.iter % self.iteration_per_task[-1]
        if local_iter == 0:
            self.stagex = 0
        elif local_iter >= self.iteration_per_task[self.stagex]:
            self.stagex += 1
        self.iter += 1


# Logging and write-to-disk utilities
def init_logging(log_dir, loglevel=logging.DEBUG):
    """Initialize logging: file log_dir/log.out plus mirroring to stdout."""
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "log.out")
    logging.basicConfig(filename=log_file, level=loglevel)
    handler = logging.StreamHandler(stream=sys.stdout)
    logging.getLogger().addHandler(handler)


def print_cuda_stats():
    """Log CUDA memory stats (no-op besides a message when CUDA is unavailable)."""
    if torch.cuda.is_available():
        print_rank("torch.cuda.memory_allocated(): {}".format(torch.cuda.memory_allocated()))
        # NOTE(review): memory_cached() is a deprecated alias of memory_reserved()
        # in newer torch versions — confirm the targeted torch version.
        print_rank("torch.cuda.memory_cached(): {}".format(torch.cuda.memory_cached()))
        print_rank("torch.cuda.synchronize(): {}".format(torch.cuda.synchronize()))
    else:
        print_rank("No CUDA GPU available")


def print_rank(str, loglevel=logging.INFO):
    """Log a timestamped message. NOTE(review): parameter name shadows builtin str."""
    str = "{} : {}".format(time.ctime(), str)
    logging.log(loglevel, str)


def print_profiler(profiler, loglevel=logging.INFO):
    """Log the top-20 cumulative-time entries of a cProfile profiler, line by line."""
    memfile = io.StringIO()
    pstats.Stats(profiler, stream=memfile) \
        .strip_dirs() \
        .sort_stats(pstats.SortKey.CUMULATIVE) \
        .print_stats(20)
    for l in memfile.getvalue().split('\n'):
        print_rank(l, loglevel=loglevel)
    memfile.close()


def write_yaml(save_path, config):
    """Dump a config dict to a YAML file."""
    with open(save_path, 'w', encoding='utf8') as yaml_file:
        yaml.dump(config, yaml_file, default_flow_style=False)


def torch_save(save_path, state_or_model):
    """Thin wrapper around torch.save (kept so try_except_save can retry it)."""
    torch.save(state_or_model, save_path)


def write_tokens(save_path, token_list):
    """Write one token per line to a UTF-8 text file."""
    with open(save_path, 'w', encoding='utf8') as token_fid:
        for w in token_list:
            token_fid.write(w + '\n')


def try_except_save(save_fn, **kwargs):
    """ Try to write it out 3 times, retrying on IOError."""
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            save_fn(**kwargs)
        except IOError:
            print_rank("Write operation failed on {} attempt".format(attempt))
        else:
            print_rank("Write operation succeeded in {} attempts".format(attempt))
            return


def write_nbest_jsonl(uttid2jsonl, uttid2hypos, uttid2scores, outputpath, nbest, orgpath="", newpath=""):
    """ Dump a json list file with n-best hypos.

    Args:
        uttid2jsonl (dict): utterance id -> template JSON record
        uttid2hypos (dict): utterance id -> list of hypothesis token lists
        uttid2scores (dict): utterance id -> per-hypothesis model scores
        outputpath (str): destination JSON-list file
        nbest (int): number of hypotheses per utterance
        orgpath/newpath (str): substring replacement applied to each record's "wav" path
    """
    newjsonl = []
    for uttid, jsonl in uttid2jsonl.items():
        if not uttid in uttid2hypos:
            print("Missing utterance {} in results".format(uttid))
            continue
        hypos = uttid2hypos[uttid]
        if nbest > 1:
            # re-normalize the probablity from N-best: ignoring the events out of the N-best hypos
            weights = uttid2scores[uttid]
            if len(weights) < nbest:
                # Pad missing scores with the top hypothesis' score.
                for n in range(len(weights), nbest):
                    print_rank("Mising {}-th best result in {}. Appending {}".format(n, uttid, weights[0]))
                    weights = np.append(weights, np.array(weights[0]))
            weights = softmax(weights[0:nbest]) if uttid in uttid2scores else np.ones(nbest) / nbest

            # Filling the missing hypos with the 1st best candidate
            for n in range(min(nbest, len(hypos))):
                newjson = copy.deepcopy(jsonl)
                newjson["id"] = "{}-{}".format(uttid, n)
                newjson["text"] = " ".join(hypos[n])
                newjson["loss_weight"] = weights[n]
        else:
            newjson = copy.deepcopy(jsonl)
            newjson["id"] = uttid
            newjson["text"] = " ".join(hypos[0])
        # NOTE(review): in the nbest>1 branch this append sits outside the inner
        # loop, so only the LAST of the n candidate records is kept — looks like
        # the append was meant to be inside the loop; confirm intent.
        newjsonl.append(newjson)

    with open(outputpath, 'w') as ofp:
        for jsonl in newjsonl:
            jsonl["wav"] = jsonl["wav"].replace(orgpath, newpath)
            ofp.write("{}\n".format(json.dumps(jsonl)))
    return True


def write_multitask_jsonl(uttid2jsonl, uttid2hypos, uttid2scores, outputpath, nbest, orgpath="", newpath=""):
    """ Dump a json list file with n-best hypos.

    For nbest == 1 this delegates to write_nbest_jsonl; otherwise each record
    carries the 1-best text plus the remaining hypotheses in "subtextl" and
    softmax-normalized "task_weights". Records where every hypothesis is the
    empty string are skipped.
    """
    if nbest == 1:
        return write_nbest_jsonl(uttid2jsonl, uttid2hypos, uttid2scores, outputpath, nbest, orgpath, newpath)

    newjsonl = []
    for uttid, jsonl in uttid2jsonl.items():
        if not uttid in uttid2hypos:
            print_rank("Missing utterance {} in results".format(uttid))
            continue
        hypos = uttid2hypos[uttid]

        # re-normalize the probablity from N-best: ignoring the events out of the N-best hypos
        weights = uttid2scores[uttid]
        if len(weights) < nbest:
            # Pad missing scores with the top hypothesis' score.
            for n in range(len(weights), nbest):
                print_rank("Mising {}-th best result in {}. Appending {}".format(n, uttid, weights[0]))
                weights = np.append(weights, np.array(weights[0]))
        weights = softmax(weights[0:nbest]) if uttid in uttid2scores else np.ones(nbest) / nbest
        # NOTE(review): newjson aliases (does not copy) the template record.
        newjson = jsonl
        newjson["task_weights"] = weights.tolist()
        assert len(weights) == nbest, "{}: Weight length does not match: {} != {}".format(uttid, len(weights), nbest)
        newjson["text"] = " ".join(hypos[0])
        newjson["subtextl"] = []
        all_null_results = newjson["text"] == ""
        for n in range(1, nbest):
            if n < len(hypos):
                newjson["subtextl"].append(" ".join(hypos[n]))
            else:
                # Fall back to the 1-best hypothesis when the n-th is missing.
                print_rank("Mising {}-th best result in {}".format(n, uttid))
                newjson["subtextl"].append(" ".join(hypos[0]))
            if all_null_results is True:
                all_null_results = newjson["subtextl"][n - 1] == ""
        assert len(newjson["subtextl"]) == nbest - 1, "#sub-rec results does not match: {} != {}".format(len(newjson["subtextl"]), nbest - 1)

        # take meaningful results only and ignore null string
        if all_null_results is False:
            newjsonl.append(newjson)
        else:
            print_rank("Skip {}: Invalid result '{}'".format(uttid, newjson["text"]))

    with open(outputpath, 'w') as ofp:
        for jsonl in newjsonl:
            jsonl["wav"] = jsonl["wav"].replace(orgpath, newpath)
            ofp.write("{}\n".format(json.dumps(jsonl)))
    return True


# NOTE(review): mutable default arguments — the OrderedDict defaults are shared
# across calls, so successive calls without explicit dicts accumulate entries.
# Looks unintentional; confirm before changing (callers may rely on it).
def load_eval_result_jsonl(resultjsonl, uttid2hypos=OrderedDict(), uttid2scores=OrderedDict(), dumpfp=None, dump_msg="RESULT: "):
    """Load the result JSON list file dumped by Evaluator().

    Args:
        resultjsonl (str): input JSON list file
        uttid2hypos: (dict): maps the utterance ID to text, [uttid] = hypothesis text
        uttid2scores (dict): maps the utterance ID to a confidence score, [uttid] = confidence score(s)
        dumpfp (file): pointer where the WERs will be written out
        dump_msg (str): message string before the WER result

    Returns:
        (uttid2hypos, uttid2scores, total_weighted_best_wer,
         total_weighted_oracle_wer, total_length)
    """
    total_weighted_best_wer = 0
    total_weighted_oracle_wer = 0
    total_length = 0
    with open(resultjsonl) as resultfp:
        for line in resultfp:
            elems = json.loads(line.strip())
            if "hypothesis" in elems:
                # Per-utterance record: collect hypotheses and (optionally) scores.
                uttid = elems["utt_id"]
                params = list(elems["hypothesis"].keys())
                uttid2hypos[uttid] = elems["hypothesis"][params[0]]
                if "nbest_model_scores" in elems:
                    uttid2scores[uttid] = np.array(elems["nbest_model_scores"][params[0]])
            else:
                # Summary record: accumulate length-weighted best/oracle WER.
                print_rank("Result: {}".format(line.strip()))
                if dumpfp is not None:
                    dumpfp.write("{}{}\n".format(dump_msg, line.strip()))
                params = list(elems["wer-"].keys())
                total_weighted_best_wer += elems["wer-"][params[0]]["best_wer"] * elems["wer-"][params[0]]["total_length"]
                total_weighted_oracle_wer += elems["wer-"][params[0]]["oracle_wer"] * elems["wer-"][params[0]]["total_length"]
                total_length += elems["wer-"][params[0]]["total_length"]
    return uttid2hypos, uttid2scores, total_weighted_best_wer, total_weighted_oracle_wer, total_length


def find_pretrained_model(model_path, config):
    """Return the pre-trained/seed model path if provided in the config, else None."""
    output_file = None
    if config.get("pretrained_model_path", None):
        output_file = config["pretrained_model_path"]
        print_rank('Loading Model from: {}'.format(output_file), loglevel=logging.INFO)
    return output_file


def flatten_grads_model(learner) -> np.ndarray:
    """Given a model flatten all params and return as np array."""
    return np.concatenate([w.grad.detach().clone().cpu().numpy().flatten() for w in learner.parameters()])


def flatten_grads_array(param_array) -> np.array:
    """Given a model flatten all params and return as np array.

    param_array is a sequence of parameter collections; one flat vector is
    produced per collection.
    """
    N = len(param_array)
    tmp_array = []
    for i in range(N):
        tmp_array.append(np.concatenate([w.detach().clone().cpu().numpy().flatten() for w in param_array[i]]))
    return np.array(tmp_array)


def dist_weights_to_model(weights, parameters):
    """Updates the model parameters with the supplied weights.

    weights is a flat numpy vector; consecutive slices are reshaped into each
    parameter tensor in iteration order.
    """
    offset = 0
    for param in parameters:
        new_size = functools.reduce(lambda x, y: x * y, param.shape)
        current_data = weights[offset:offset + new_size]
        param.data[:] = torch.from_numpy(current_data.reshape(param.shape)).to(param.data)
        offset += new_size


def dist_params_to_model(grads, model):
    """Updates the model gradients (Corresponding to each param) with the supplied grads.

    grads is a flat numpy vector; slices are reshaped per parameter and ADDED to
    any existing .grad (set directly when .grad is None).
    """
    offset = 0
    for p in model:
        new_size = functools.reduce(lambda x, y: x * y, p.data.shape)
        current_data = torch.from_numpy(grads[offset:offset + new_size].reshape(p.data.shape)).type(p.data.dtype).to(p)
        # NOTE(review): `== None` should be `is None` per PEP 8; behavior here is
        # the same for the None case.
        p.grad = current_data if p.grad == None else p.grad + current_data
        offset += new_size


def reshape_params_to_model(grads, model):
    """ Given Gradients and a model architecture this method updates the model
    gradients (Corresponding to each param) with the supplied grads.

    Returns the list of per-parameter gradient tensors without mutating the model.
    """
    offset = 0
    reshaped_grads = []
    for p in model:
        new_size = functools.reduce(lambda x, y: x * y, p.shape)
        current_data = torch.from_numpy(grads[offset:offset + new_size].reshape(p.shape)).type(p.dtype).to(p)
        reshaped_grads.append(current_data)
        offset += new_size
    return reshaped_grads


def to_device(x):
    """Move a tensor/module to CUDA when available, otherwise return it unchanged."""
    return x.cuda() if torch.cuda.is_available() else x


def update_json_log(log_path, status_info):
    """Merge status_info into the JSON status file at log_path (read-modify-write)."""
    elems = {}
    if os.path.exists(log_path):
        with open(log_path, 'r') as logfp:
            elems = json.load(logfp)
            print_rank("Loaded status info: {}".format(elems))
    for k, v in status_info.items():
        elems[k] = v
    with open(log_path, 'w') as logfp:
        json.dump(elems, logfp)
    print_rank("Updated status info: {}".format(elems))


def scrub_empty_clients(data_strct):
    """ Clean empty clients in the data structure: drop every user whose
    num_samples entry is 0, preserving user_data_label when present."""
    users_out = []
    user_data_out = {}
    num_samples_out = []
    if 'user_data_label' in data_strct.keys():
        user_data_label_out = {}
    for ix, user in enumerate(data_strct['users']):
        if data_strct['num_samples'][ix] > 0:
            users_out.append(user)
            user_data_out[user] = data_strct['user_data'][user]
            num_samples_out.append(data_strct['num_samples'][ix])
            if 'user_data_label' in data_strct.keys():
                user_data_label_out[user] = data_strct['user_data_label'][user]
    if ('user_data_label' in data_strct.keys()):
        return edict({'users': users_out, 'user_data': user_data_out, 'num_samples': num_samples_out, 'user_data_label': user_data_label_out})
    else:
        return edict({'users': users_out, 'user_data': user_data_out, 'num_samples': num_samples_out})


def compute_grad_cosines(grads, model_grad):
    """Cosine similarity between each client gradient in grads and model_grad.

    Each gradient is a list of per-parameter tensors; returns 0 for degenerate
    (zero-norm) inputs.
    """
    def compute_cosine(g, m):
        tot = 0
        g2 = 0
        m2 = 0
        for p1, p2 in zip(g, m):
            tot += torch.mul(p1, p2.to('cpu')).sum().item()
            g2 += torch.mul(p1, p1).sum().item()
            m2 += torch.mul(p2, p2).sum().item()
        return tot / (np.sqrt(g2) * np.sqrt(m2)) if g2 > 0 and m2 > 0 else 0
    return [compute_cosine(g, model_grad) for g in grads]


# Personalization Routines
def convex_inference(model_global, model_personal, alpha):
    """ Model interpolation: accuracy of alpha-blended personal/global probabilities
    against the global model's labels."""
    targets = torch.tensor(model_global['labels'])
    probs = alpha * model_personal['probabilities'] + (1 - alpha) * model_global['probabilities']
    probs = torch.argmax(torch.tensor(probs), dim=1)
    return torch.mean((probs == targets).float()).detach().cpu().item()


def alpha_update(model_global, model_personal, alpha, eta):
    """ Training convex model interpolation weight.

    One gradient step on alpha (learning rate eta) with an L2-style 0.02*alpha
    term; the result is clipped to (0.0001, 0.9999) and falls back to 0.75 when
    non-finite.
    """
    grad_alpha = 0.0
    for l_params, p_params in zip(model_global.parameters(), model_personal.parameters()):
        dif = p_params.data - l_params.data
        grad = alpha * p_params.grad + (1 - alpha) * l_params.grad
        grad_alpha += dif.view(-1).T.dot(grad.view(-1))

    grad_alpha += 0.02 * alpha
    alpha_n = alpha - eta * grad_alpha
    alpha_n = np.clip(alpha_n.detach().cpu().item(), 0.0001, 0.9999)
    return alpha_n if np.isfinite(alpha_n) else 0.75


# Semi-supervision Routines
def get_label_VAT(local_logits, server_logits, thre, comp):
    """ Returns the estimated labels to SemiSupervision Task.

    Per example, picks the argmax label from whichever of local/server logits is
    preferred by the comparison criterion (comp == 'var': higher variance wins;
    comp == 'ent': lower entropy wins), subject to the max logit exceeding thre.

    Returns (labels, idx, var, ratio) where idx are the selected batch indices,
    var the confidence ratios, and ratio the fraction of server-chosen labels.
    """
    bs = np.shape(local_logits)[0]
    logit_dim = np.shape(local_logits)[1]
    labels = []
    idx = []
    var = []

    if comp == 'var':
        local_var = torch.var(local_logits, dim=1)
        server_var = torch.var(server_logits, dim=1)
        server = 0
        local = 0
        ratio = 0
        for bs_i in range(bs):
            if local_var[bs_i] >= server_var[bs_i] and torch.max(local_logits[bs_i]) > thre:
                labels.append(torch.argmax(local_logits[bs_i]))
                idx.append(bs_i)
                var.append((server_var[bs_i]) / (local_var[bs_i]))
                local += 1
            if local_var[bs_i] < server_var[bs_i] and torch.max(server_logits[bs_i]) > thre:
                labels.append(torch.argmax(server_logits[bs_i]))
                idx.append(bs_i)
                var.append((local_var[bs_i]) / (server_var[bs_i]))
                server += 1
        if len(labels) != 0:
            labels = torch.stack(labels)
            var = torch.stack(var)
            ratio = server / (server + local)
    elif comp == 'ent':
        # NOTE(review): scipyst is not among the imports visible in this chunk —
        # presumably `import scipy.stats as scipyst` elsewhere; confirm it resolves.
        local_var = scipyst.entropy(local_logits.cpu(), axis=1) + 0.00001
        server_var = scipyst.entropy(server_logits.cpu(), axis=1) + 0.00001
        server = 0
        local = 0
        ratio = 0
        for bs_i in range(bs):
            if 1 / local_var[bs_i] >= 1 / server_var[bs_i] and torch.max(local_logits[bs_i]) > thre:
                labels.append(torch.argmax(local_logits[bs_i]))
                idx.append(bs_i)
                var.append((1 / server_var[bs_i]) / (1 / local_var[bs_i]))
                local += 1
            if 1 / local_var[bs_i] < 1 / server_var[bs_i] and torch.max(server_logits[bs_i]) > thre:
                labels.append(torch.argmax(server_logits[bs_i]))
                idx.append(bs_i)
                var.append((1 / local_var[bs_i]) / (1 / server_var[bs_i]))
                server += 1
        if len(labels) != 0:
            labels = torch.stack(labels)
            #var = torch.stack(var)
            ratio = server / (server + local)

    return labels, idx, var, ratio