Full Code of megvii-research/mdistiller for AI

master a08d46f10d61 cached

117 files

626.8 KB

323.7k tokens

408 symbols

1 requests

Download .txt

Showing preview only (662K chars total). Download the full file or copy to clipboard to get everything.

Repository: megvii-research/mdistiller
Branch: master
Commit: a08d46f10d61
Files: 117
Total size: 626.8 KB

Directory structure:
gitextract_b7kwa80j/

├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│   ├── cifar100/
│   │   ├── at.yaml
│   │   ├── crd.yaml
│   │   ├── dkd/
│   │   │   ├── res110_res32.yaml
│   │   │   ├── res32x4_res8x4.yaml
│   │   │   ├── res32x4_shuv1.yaml
│   │   │   ├── res32x4_shuv2.yaml
│   │   │   ├── res50_mv2.yaml
│   │   │   ├── res56_res20.yaml
│   │   │   ├── vgg13_mv2.yaml
│   │   │   ├── vgg13_vgg8.yaml
│   │   │   ├── wrn40_2_shuv1.yaml
│   │   │   ├── wrn40_2_wrn_16_2.yaml
│   │   │   └── wrn40_2_wrn_40_1.yaml
│   │   ├── dot/
│   │   │   ├── res32x4_res8x4.yaml
│   │   │   ├── res32x4_shuv2.yaml
│   │   │   └── vgg13_vgg8.yaml
│   │   ├── fitnet.yaml
│   │   ├── kd.yaml
│   │   ├── kdsvd.yaml
│   │   ├── nst.yaml
│   │   ├── ofd.yaml
│   │   ├── pkt.yaml
│   │   ├── reviewkd.yaml
│   │   ├── rkd.yaml
│   │   ├── sp.yaml
│   │   ├── vanilla.yaml
│   │   └── vid.yaml
│   ├── imagenet/
│   │   ├── r34_r18/
│   │   │   ├── at.yaml
│   │   │   ├── crd.yaml
│   │   │   ├── dkd.yaml
│   │   │   ├── dot.yaml
│   │   │   ├── kd.yaml
│   │   │   └── reviewkd.yaml
│   │   └── r50_mv1/
│   │       ├── at.yaml
│   │       ├── crd.yaml
│   │       ├── dkd.yaml
│   │       ├── dot.yaml
│   │       ├── kd.yaml
│   │       ├── ofd.yaml
│   │       └── reviewkd.yaml
│   └── tiny_imagenet/
│       └── dot/
│           ├── r18_mv2.yaml
│           └── r18_shuv2.yaml
├── detection/
│   ├── README.md
│   ├── __init__.py
│   ├── configs/
│   │   ├── Base-Distillation.yaml
│   │   ├── DKD/
│   │   │   ├── DKD-MV2-R50.yaml
│   │   │   ├── DKD-R18-R101.yaml
│   │   │   ├── DKD-R50-R101.yaml
│   │   │   ├── ReviewDKD-MV2-R50.yaml
│   │   │   ├── ReviewDKD-R18-R101.yaml
│   │   │   └── ReviewDKD-R50-R101.yaml
│   │   └── ReviewKD/
│   │       ├── ReviewKD-MV2-R50-Mask.yaml
│   │       ├── ReviewKD-MV2-R50.yaml
│   │       ├── ReviewKD-R18-R101-Mask.yaml
│   │       ├── ReviewKD-R18-R101.yaml
│   │       ├── ReviewKD-R50-R101-Mask.yaml
│   │       └── ReviewKD-R50-R101.yaml
│   ├── model/
│   │   ├── __init__.py
│   │   ├── backbone/
│   │   │   ├── __init__.py
│   │   │   ├── fpn.py
│   │   │   ├── mobilenetv2.py
│   │   │   └── resnet.py
│   │   ├── config.py
│   │   ├── rcnn.py
│   │   ├── reviewkd.py
│   │   └── teacher/
│   │       ├── __init__.py
│   │       └── teacher.py
│   └── train_net.py
├── mdistiller/
│   ├── __init__.py
│   ├── dataset/
│   │   ├── __init__.py
│   │   ├── cifar100.py
│   │   ├── imagenet.py
│   │   └── tiny_imagenet.py
│   ├── distillers/
│   │   ├── AT.py
│   │   ├── CRD.py
│   │   ├── DKD.py
│   │   ├── FitNet.py
│   │   ├── KD.py
│   │   ├── KDSVD.py
│   │   ├── NST.py
│   │   ├── OFD.py
│   │   ├── PKT.py
│   │   ├── RKD.py
│   │   ├── ReviewKD.py
│   │   ├── SP.py
│   │   ├── VID.py
│   │   ├── __init__.py
│   │   ├── _base.py
│   │   └── _common.py
│   ├── engine/
│   │   ├── __init__.py
│   │   ├── cfg.py
│   │   ├── dot.py
│   │   ├── trainer.py
│   │   └── utils.py
│   └── models/
│       ├── __init__.py
│       ├── cifar/
│       │   ├── ShuffleNetv1.py
│       │   ├── ShuffleNetv2.py
│       │   ├── __init__.py
│       │   ├── mobilenetv2.py
│       │   ├── mv2_tinyimagenet.py
│       │   ├── resnet.py
│       │   ├── resnetv2.py
│       │   ├── vgg.py
│       │   └── wrn.py
│       └── imagenet/
│           ├── __init__.py
│           ├── mobilenetv1.py
│           └── resnet.py
├── requirements.txt
├── setup.py
└── tools/
    ├── eval.py
    ├── train.py
    └── visualizations/
        ├── correlation.ipynb
        └── tsne.ipynb

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitattributes
================================================
* linguist-language=Python


================================================
FILE: .gitignore
================================================
.idea/
data
data/
output
output*/
detection/datasets
detection/output
detection/output*/
ckpts/
*.pth
*.t7
tmp*.py

*.pdf


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# wandb
wandb/

# download ckpts
download_ckpts/

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2022 MEGVII Research

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.



detectron2

Copyright 2020 - present, Facebook, Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


RepDistiller

Copyright (c) 2020, Yonglong Tian


================================================
FILE: README.md
================================================
<div align=center><img src=".github/mdistiller.png" width="40%" ><div align=left>

This repo is

(1) a PyTorch library that provides classical knowledge distillation algorithms on mainstream CV benchmarks,

(2) the official implementation of the CVPR-2022 paper: [Decoupled Knowledge Distillation](https://arxiv.org/abs/2203.08679).

(3) the official implementation of the ICCV-2023 paper: [DOT: A Distillation-Oriented Trainer](https://openaccess.thecvf.com/content/ICCV2023/papers/Zhao_DOT_A_Distillation-Oriented_Trainer_ICCV_2023_paper.pdf).


# DOT: A Distillation-Oriented Trainer

### Framework

<div style="text-align:center"><img src=".github/dot.png" width="80%" ></div>

### Main Benchmark Results

On CIFAR-100:

| Teacher <br> Student | ResNet32x4 <br> ResNet8x4| VGG13 <br> VGG8| ResNet32x4 <br>  ShuffleNet-V2|
|:---------------:|:-----------------:|:-----------------:|:-----------------:|
| KD | 73.33 | 72.98 | 74.45 |
| **KD+DOT** | **75.12** | **73.77** | **75.55** |

On Tiny-ImageNet:

| Teacher <br> Student |ResNet18 <br> MobileNet-V2|ResNet18 <br> ShuffleNet-V2|
|:---------------:|:-----------------:|:-----------------:|
| KD | 58.35 | 62.26 | 
| **KD+DOT** | **64.01** | **65.75** |

On ImageNet:

| Teacher <br> Student |ResNet34 <br> ResNet18|ResNet50 <br> MobileNet-V1|
|:---------------:|:-----------------:|:-----------------:|
| KD | 71.03 | 70.50 | 
| **KD+DOT** | **71.72** | **73.09** |

# Decoupled Knowledge Distillation

### Framework & Performance

<div style="text-align:center"><img src=".github/dkd.png" width="80%" ></div>

### Main Benchmark Results

On CIFAR-100:


| Teacher <br> Student |ResNet56 <br> ResNet20|ResNet110 <br> ResNet32| ResNet32x4 <br> ResNet8x4| WRN-40-2 <br> WRN-16-2| WRN-40-2 <br> WRN-40-1 | VGG13 <br> VGG8|
|:---------------:|:-----------------:|:-----------------:|:-----------------:|:------------------:|:------------------:|:--------------------:|
| KD | 70.66 | 73.08 | 73.33 | 74.92 | 73.54 | 72.98 |
| **DKD** | **71.97** | **74.11** | **76.32** | **76.23** | **74.81** | **74.68** |


| Teacher <br> Student |ResNet32x4 <br> ShuffleNet-V1|WRN-40-2 <br> ShuffleNet-V1| VGG13 <br> MobileNet-V2| ResNet50 <br> MobileNet-V2| ResNet32x4 <br> MobileNet-V2|
|:---------------:|:-----------------:|:-----------------:|:-----------------:|:------------------:|:------------------:|
| KD | 74.07 | 74.83 | 67.37 | 67.35 | 74.45 |
| **DKD** | **76.45** | **76.70** | **69.71** | **70.35** | **77.07** |


On ImageNet:

| Teacher <br> Student |ResNet34 <br> ResNet18|ResNet50 <br> MobileNet-V1|
|:---------------:|:-----------------:|:-----------------:|
| KD | 71.03 | 70.50 | 
| **DKD** | **71.70** | **72.05** |

# MDistiller

### Introduction

MDistiller supports the following distillation methods on CIFAR-100, ImageNet and MS-COCO:
|Method|Paper Link|CIFAR-100|ImageNet|MS-COCO|
|:---:|:---:|:---:|:---:|:---:|
|KD| <https://arxiv.org/abs/1503.02531> |&check;|&check;| |
|FitNet| <https://arxiv.org/abs/1412.6550> |&check;| | |
|AT| <https://arxiv.org/abs/1612.03928> |&check;|&check;| |
|NST| <https://arxiv.org/abs/1707.01219> |&check;| | |
|PKT| <https://arxiv.org/abs/1803.10837> |&check;| | |
|KDSVD| <https://arxiv.org/abs/1807.06819> |&check;| | |
|OFD| <https://arxiv.org/abs/1904.01866> |&check;|&check;| |
|RKD| <https://arxiv.org/abs/1904.05068> |&check;| | |
|VID| <https://arxiv.org/abs/1904.05835> |&check;| | |
|SP| <https://arxiv.org/abs/1907.09682> |&check;| | |
|CRD| <https://arxiv.org/abs/1910.10699> |&check;|&check;| |
|ReviewKD| <https://arxiv.org/abs/2104.09044> |&check;|&check;|&check;|
|DKD| <https://arxiv.org/abs/2203.08679> |&check;|&check;|&check;|


### Installation

Environments:

- Python 3.6
- PyTorch 1.9.0
- torchvision 0.10.0

Install the package:

```
sudo pip3 install -r requirements.txt
sudo python3 setup.py develop
```

### Getting started

0. Wandb as the logger

- The registeration: <https://wandb.ai/home>.
- If you don't want wandb as your logger, set `CFG.LOG.WANDB` as `False` at `mdistiller/engine/cfg.py`.

1. Evaluation

- You can evaluate the performance of our models or models trained by yourself.

- Our models are at <https://github.com/megvii-research/mdistiller/releases/tag/checkpoints>, please download the checkpoints to `./download_ckpts`

- If test the models on ImageNet, please download the dataset at <https://image-net.org/> and put them to `./data/imagenet`

  ```bash
  # evaluate teachers
  python3 tools/eval.py -m resnet32x4 # resnet32x4 on cifar100
  python3 tools/eval.py -m ResNet34 -d imagenet # ResNet34 on imagenet
  
  # evaluate students
  python3 tools/eval.p -m resnet8x4 -c download_ckpts/dkd_resnet8x4 # dkd-resnet8x4 on cifar100
  python3 tools/eval.p -m MobileNetV1 -c download_ckpts/imgnet_dkd_mv1 -d imagenet # dkd-mv1 on imagenet
  python3 tools/eval.p -m model_name -c output/your_exp/student_best # your checkpoints
  ```


2. Training on CIFAR-100

- Download the `cifar_teachers.tar` at <https://github.com/megvii-research/mdistiller/releases/tag/checkpoints> and untar it to `./download_ckpts` via `tar xvf cifar_teachers.tar`.

  ```bash
  # for instance, our DKD method.
  python3 tools/train.py --cfg configs/cifar100/dkd/res32x4_res8x4.yaml

  # you can also change settings at command line
  python3 tools/train.py --cfg configs/cifar100/dkd/res32x4_res8x4.yaml SOLVER.BATCH_SIZE 128 SOLVER.LR 0.1
  ```

3. Training on ImageNet

- Download the dataset at <https://image-net.org/> and put them to `./data/imagenet`

  ```bash
  # for instance, our DKD method.
  python3 tools/train.py --cfg configs/imagenet/r34_r18/dkd.yaml
  ```

4. Training on MS-COCO

- see [detection.md](detection/README.md)


5. Extension: Visualizations

- Jupyter notebooks: [tsne](tools/visualizations/tsne.ipynb) and [correlation_matrices](tools/visualizations/correlation.ipynb)


### Custom Distillation Method

1. create a python file at `mdistiller/distillers/` and define the distiller
  
  ```python
  from ._base import Distiller

  class MyDistiller(Distiller):
      def __init__(self, student, teacher, cfg):
          super(MyDistiller, self).__init__(student, teacher)
          self.hyper1 = cfg.MyDistiller.hyper1
          ...

      def forward_train(self, image, target, **kwargs):
          # return the output logits and a Dict of losses
          ...
      # rewrite the get_learnable_parameters function if there are more nn modules for distillation.
      # rewrite the get_extra_parameters if you want to obtain the extra cost.
    ...
  ```

2. regist the distiller in `distiller_dict` at `mdistiller/distillers/__init__.py`

3. regist the corresponding hyper-parameters at `mdistiller/engines/cfg.py`

4. create a new config file and test it.

# Citation

If this repo is helpful for your research, please consider citing the paper:

```BibTeX
@article{zhao2022dkd,
  title={Decoupled Knowledge Distillation},
  author={Zhao, Borui and Cui, Quan and Song, Renjie and Qiu, Yiyu and Liang, Jiajun},
  journal={arXiv preprint arXiv:2203.08679},
  year={2022}
}
@article{zhao2023dot,
  title={DOT: A Distillation-Oriented Trainer},
  author={Zhao, Borui and Cui, Quan and Song, Renjie and Liang, Jiajun},
  journal={arXiv preprint arXiv:2307.08436},
  year={2023}
}
```

# License

MDistiller is released under the MIT license. See [LICENSE](LICENSE) for details.

# Acknowledgement

- Thanks for CRD and ReviewKD. We build this library based on the [CRD's codebase](https://github.com/HobbitLong/RepDistiller) and the [ReviewKD's codebase](https://github.com/dvlab-research/ReviewKD).

- Thanks Yiyu Qiu and Yi Shi for the code contribution during their internship in MEGVII Technology.

- Thanks Xin Jin for the discussion about DKD.


================================================
FILE: configs/cifar100/at.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "at,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "AT"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/crd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "crd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "CRD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "crd" 


================================================
FILE: configs/cifar100/dkd/res110_res32.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res110,res32"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "resnet110"
  STUDENT: "resnet32"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
DKD:
  BETA: 2.0


================================================
FILE: configs/cifar100/dkd/res32x4_res8x4.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/dkd/res32x4_shuv1.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res32x4,shuv1"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "resnet32x4"
  STUDENT: "ShuffleV1"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.01
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/dkd/res32x4_shuv2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res32x4,shuv2"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "resnet32x4"
  STUDENT: "ShuffleV2"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.01
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/dkd/res50_mv2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res50,mv2"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV2"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.01
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/dkd/res56_res20.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res56,res20"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "resnet56"
  STUDENT: "resnet20"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
DKD:
  BETA: 2.0


================================================
FILE: configs/cifar100/dkd/vgg13_mv2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,vgg13,mv2"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "vgg13"
  STUDENT: "MobileNetV2"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.01
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
DKD:
  BETA: 6.0



================================================
FILE: configs/cifar100/dkd/vgg13_vgg8.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,vgg13,vgg8"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "vgg13"
  STUDENT: "vgg8"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
DKD:
  BETA: 6.0


================================================
FILE: configs/cifar100/dkd/wrn40_2_shuv1.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,wrn_40_2,shuv1"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "wrn_40_2"
  STUDENT: "ShuffleV1"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.01
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/dkd/wrn40_2_wrn_16_2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,wrn_40_2,wrn_16_2"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "wrn_40_2"
  STUDENT: "wrn_16_2"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
DKD:
  BETA: 6.0


================================================
FILE: configs/cifar100/dkd/wrn40_2_wrn_40_1.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,wrn_40_2,wrn_40_1"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "DKD"
  TEACHER: "wrn_40_2"
  STUDENT: "wrn_40_1"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
DKD:
  BETA: 6.0


================================================
FILE: configs/cifar100/dot/res32x4_res8x4.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,res32x4,res8x4"
  PROJECT: "dot_cifar"
DISTILLER:
  TYPE: "KD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.075


================================================
FILE: configs/cifar100/dot/res32x4_shuv2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,res32x4,shuv2"
  PROJECT: "dot_cifar"
DISTILLER:
  TYPE: "KD"
  TEACHER: "resnet32x4"
  STUDENT: "ShuffleV2"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.01
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.075


================================================
FILE: configs/cifar100/dot/vgg13_vgg8.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,vgg13,vgg8"
  PROJECT: "dot_cifar"
DISTILLER:
  TYPE: "KD"
  TEACHER: "vgg13"
  STUDENT: "vgg8"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.075


================================================
FILE: configs/cifar100/fitnet.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "fitnet,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "FITNET"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/kd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "KD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/kdsvd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kdsvd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "KDSVD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/nst.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "nst,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "NST"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/ofd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "ofd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "OFD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/pkt.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "pkt,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "PKT"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/reviewkd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "reviewkd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "REVIEWKD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
REVIEWKD:
  REVIEWKD_WEIGHT: 5.0
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"



================================================
FILE: configs/cifar100/rkd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "rkd,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "RKD"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/sp.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "sp,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "SP"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/vanilla.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "vanilla,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "NONE"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/cifar100/vid.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "vid,res32x4,res8x4"
  PROJECT: "cifar100_baselines"
DISTILLER:
  TYPE: "VID"
  TEACHER: "resnet32x4"
  STUDENT: "resnet8x4"
SOLVER:
  BATCH_SIZE: 64
  EPOCHS: 240
  LR: 0.05
  LR_DECAY_STAGES: [150, 180, 210]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"


================================================
FILE: configs/imagenet/r34_r18/at.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "at,res34,res18"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "AT"
  TEACHER: "ResNet34"
  STUDENT: "ResNet18"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r34_r18/crd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "crd,res34,res18"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "CRD"
  TEACHER: "ResNet34"
  STUDENT: "ResNet18"
SOLVER:
  BATCH_SIZE: 256
  EPOCHS: 100
  LR: 0.1
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "crd" 
CRD:
  FEAT:
    STUDENT_DIM: 512
    TEACHER_DIM: 512
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r34_r18/dkd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res34,res18"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "DKD"
  TEACHER: "ResNet34"
  STUDENT: "ResNet18"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10
DKD:
  CE_WEIGHT: 1.0
  BETA: 0.5
  T: 1.0
  WARMUP: 1


================================================
FILE: configs/imagenet/r34_r18/dot.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,res34,res18"
  PROJECT: "dot_imagenet"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "KD"
  TEACHER: "ResNet34"
  STUDENT: "ResNet18"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.09
KD:
  TEMPERATURE: 1
  LOSS:
    CE_WEIGHT: 0.5
    KD_WEIGHT: 0.5
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r34_r18/kd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,res34,res18"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "KD"
  TEACHER: "ResNet34"
  STUDENT: "ResNet18"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
KD:
  TEMPERATURE: 1
  LOSS:
    CE_WEIGHT: 0.5
    KD_WEIGHT: 0.5
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r34_r18/reviewkd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "reviewkd,res34,res18"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "REVIEWKD"
  TEACHER: "ResNet34"
  STUDENT: "ResNet18"
SOLVER:
  BATCH_SIZE: 256
  EPOCHS: 100
  LR: 0.1
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10
REVIEWKD:
  CE_WEIGHT: 1.0
  REVIEWKD_WEIGHT: 1.0
  WARMUP_EPOCHS: 1
  SHAPES: [1,7,14,28,56]
  OUT_SHAPES: [1,7,14,28,56]
  IN_CHANNELS: [64,128,256,512,512]
  OUT_CHANNELS: [64,128,256,512,512]
  STU_PREACT: True


================================================
FILE: configs/imagenet/r50_mv1/at.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "at,res50,mobilenetv1"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "AT"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r50_mv1/crd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "crd,res50,mobilenetv1"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "CRD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 256
  EPOCHS: 100
  LR: 0.1
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "crd" 
CRD:
  FEAT:
    STUDENT_DIM: 1024
    TEACHER_DIM: 2048
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r50_mv1/dkd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "dkd,res50,mobilenetv1"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "DKD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10
DKD:
  CE_WEIGHT: 1.0
  BETA: 2.0
  T: 1.0
  WARMUP: 1


================================================
FILE: configs/imagenet/r50_mv1/dot.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,res50,mobilenetv1"
  PROJECT: "dot_imagenet"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "KD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.09
KD:
  TEMPERATURE: 1
  LOSS:
    CE_WEIGHT: 0.5
    KD_WEIGHT: 0.5
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r50_mv1/kd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,res50,mobilenetv1"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "KD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 512
  EPOCHS: 100
  LR: 0.2
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
KD:
  TEMPERATURE: 1
  LOSS:
    CE_WEIGHT: 0.5
    KD_WEIGHT: 0.5
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r50_mv1/ofd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "ofd,res50,mobilenetv1"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "OFD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 128
  EPOCHS: 100
  LR: 0.05
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
OFD:
  LOSS:
    FEAT_WEIGHT: 0.0001
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10


================================================
FILE: configs/imagenet/r50_mv1/reviewkd.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "reviewkd,res50,mobilenetv1"
  PROJECT: "imagenet_baselines"
DATASET:
  TYPE: "imagenet"
  NUM_WORKERS: 32
  TEST:
    BATCH_SIZE: 128
DISTILLER:
  TYPE: "REVIEWKD"
  TEACHER: "ResNet50"
  STUDENT: "MobileNetV1"
SOLVER:
  BATCH_SIZE: 256
  EPOCHS: 100
  LR: 0.1
  LR_DECAY_STAGES: [30, 60, 90]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0001
  MOMENTUM: 0.9
  TYPE: "SGD"
LOG:
  TENSORBOARD_FREQ: 50
  SAVE_CHECKPOINT_FREQ: 10
REVIEWKD:
  CE_WEIGHT: 1.0
  REVIEWKD_WEIGHT: 8.0
  WARMUP_EPOCHS: 1
  SHAPES: [1,7,14,28,56]
  OUT_SHAPES: [1,7,14,28,56]
  IN_CHANNELS: [128,256,512,1024,1024]
  OUT_CHANNELS: [256,512,1024,2048,2048]
  MAX_MID_CHANNEL: 256
  STU_PREACT: True


================================================
FILE: configs/tiny_imagenet/dot/r18_mv2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,r18,mv2"
  PROJECT: "dot_tinyimagenet"
DATASET:
  TYPE: "tiny_imagenet"
  NUM_WORKERS: 16
DISTILLER:
  TYPE: "KD"
  TEACHER: "ResNet18"
  STUDENT: "MobileNetV2"
SOLVER:
  BATCH_SIZE: 256
  EPOCHS: 200
  LR: 0.2
  LR_DECAY_STAGES: [60, 120, 160]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.075


================================================
FILE: configs/tiny_imagenet/dot/r18_shuv2.yaml
================================================
EXPERIMENT:
  NAME: ""
  TAG: "kd,dot,r18,shuv2"
  PROJECT: "dot_tinyimagenet"
DATASET:
  TYPE: "tiny_imagenet"
  NUM_WORKERS: 16
DISTILLER:
  TYPE: "KD"
  TEACHER: "ResNet18"
  STUDENT: "ShuffleV2"
SOLVER:
  BATCH_SIZE: 256
  EPOCHS: 200
  LR: 0.2
  LR_DECAY_STAGES: [60, 120, 160]
  LR_DECAY_RATE: 0.1
  WEIGHT_DECAY: 0.0005
  MOMENTUM: 0.9
  TYPE: "SGD"
  TRAINER: "dot"
  DOT:
    DELTA: 0.075


================================================
FILE: detection/README.md
================================================
# COCO object detection and instance segmentation

PS: based on the [ReviewKD's codebase](https://github.com/dvlab-research/ReviewKD).

## Environment

* 4 GPUs
* python 3.6
* torch 1.9.0
* torchvision 0.10.0

## Installation

Our code is based on Detectron2, please install Detectron2 refer to https://github.com/facebookresearch/detectron2.

Please put the [COCO](https://cocodataset.org/#download) dataset in datasets/.

Please put the pretrained weights for teacher and student in pretrained/. You can find the pretrained weights [here](https://github.com/dvlab-research/ReviewKD/releases/). The pretrained models we provided contains both teacher's and student's weights. The teacher's weights come from Detectron2's pretrained detector. The student's weights are ImageNet pretrained weights.

## Training

```
# Tea: R-101, Stu: R-18
python3 train_net.py --config-file configs/DKD/DKD-R18-R101.yaml --num-gpus 4

# Tea: R-101, Stu: R-50
python3 train_net.py --config-file configs/DKD/DKD-R50-R101.yaml --num-gpus 4

# Tea: R-50, Stu: MV2
python3 train_net.py --config-file configs/DKD/DKD-MV2-R50.yaml --num-gpus 4

```


================================================
FILE: detection/__init__.py
================================================


================================================
FILE: detection/configs/Base-Distillation.yaml
================================================
MODEL:
  META_ARCHITECTURE: "RCNNKD"
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  ANCHOR_GENERATOR:
    SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
  RPN:
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
    PRE_NMS_TOPK_TEST: 1000  # Per FPN level
    # Detectron1 uses 2000 proposals per-batch,
    # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
    # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000
  ROI_HEADS:
    NAME: "StandardROIHeads"
    IN_FEATURES: ["p2", "p3", "p4", "p5"]
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14

TEACHER:
  MODEL:
    META_ARCHITECTURE: "GeneralizedRCNN"
    BACKBONE:
      NAME: "build_resnet_fpn_backbone"
    RESNETS:
      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    FPN:
      IN_FEATURES: ["res2", "res3", "res4", "res5"]
    ANCHOR_GENERATOR:
      SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
      ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
    RPN:
      IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
      PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
      PRE_NMS_TOPK_TEST: 1000  # Per FPN level
      # Detectron1 uses 2000 proposals per-batch,
      # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
      # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
      POST_NMS_TOPK_TRAIN: 1000
      POST_NMS_TOPK_TEST: 1000
    ROI_HEADS:
      NAME: "StandardROIHeads"
      IN_FEATURES: ["p2", "p3", "p4", "p5"]
    ROI_BOX_HEAD:
      NAME: "FastRCNNConvFCHead"
      NUM_FC: 2
      POOLER_RESOLUTION: 7
    ROI_MASK_HEAD:
      NAME: "MaskRCNNConvUpsampleHead"
      NUM_CONV: 4
      POOLER_RESOLUTION: 14


DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
INPUT:
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2


================================================
FILE: detection/configs/DKD/DKD-MV2-R50.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/DKD-MV2-R50
MODEL:
  BACKBONE:
    NAME: "build_mobilenetv2_fpn_backbone"
    FREEZE_AT: 0
  WEIGHTS: "pretrained/mv2-r50.pth"
  MOBILENETV2:
    OUT_FEATURES: ["m2", "m3", "m4", "m5"]
  FPN:
    IN_FEATURES: ["m2", "m3", "m4", "m5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 50
KD:
  TYPE: "DKD"

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/DKD/DKD-R18-R101.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/DKD-R18-R101
INPUT:
  FORMAT: 'RGB'
MODEL:
  PIXEL_STD: [57.375, 57.120, 58.395]
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r18-r101.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 18
    RES2_OUT_CHANNELS: 64
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "DKD"

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/DKD/DKD-R50-R101.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/DKD-R50-R101
MODEL:
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r50-r101.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 50
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "DKD"

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/DKD/ReviewDKD-MV2-R50.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewDKD-MV2-R50
MODEL:
  BACKBONE:
    NAME: "build_mobilenetv2_fpn_backbone"
    FREEZE_AT: 0
  WEIGHTS: "pretrained/mv2-r50.pth"
  MOBILENETV2:
    OUT_FEATURES: ["m2", "m3", "m4", "m5"]
  FPN:
    IN_FEATURES: ["m2", "m3", "m4", "m5"]

  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 50
KD:
  TYPE: "ReviewDKD"
  REVIEWKD:
    LOSS_WEIGHT: 2.0

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/DKD/ReviewDKD-R18-R101.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewDKD-R18-R101
INPUT:
  FORMAT: 'RGB'
MODEL:
  PIXEL_STD: [57.375, 57.120, 58.395]
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r18-r101.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 18
    RES2_OUT_CHANNELS: 64
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "ReviewDKD"
  REVIEWKD:
    LOSS_WEIGHT: 1.2

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/DKD/ReviewDKD-R50-R101.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewDKD-R50-R101
MODEL:
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r50-r101.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 50
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "ReviewDKD"
  REVIEWKD:
    LOSS_WEIGHT: 1.0

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/ReviewKD/ReviewKD-MV2-R50-Mask.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewKD-MV2-R50-Mask
MODEL:
  MASK_ON: True
  BACKBONE:
    NAME: "build_mobilenetv2_fpn_backbone"
    FREEZE_AT: 0
  WEIGHTS: "pretrained/mv2-r50mask.pth"
  MOBILENETV2:
    OUT_FEATURES: ["m2", "m3", "m4", "m5"]
  FPN:
    IN_FEATURES: ["m2", "m3", "m4", "m5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 50
KD:
  TYPE: "ReviewKD"
  REVIEWKD:
    LOSS_WEIGHT: 1.0

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/ReviewKD/ReviewKD-MV2-R50.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewKD-MV2-R50
MODEL:
  BACKBONE:
    NAME: "build_mobilenetv2_fpn_backbone"
    FREEZE_AT: 0
  WEIGHTS: "pretrained/mv2-r50.pth"
  MOBILENETV2:
    OUT_FEATURES: ["m2", "m3", "m4", "m5"]
  FPN:
    IN_FEATURES: ["m2", "m3", "m4", "m5"]

  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 50
KD:
  TYPE: "ReviewKD"
  REVIEWKD:
    LOSS_WEIGHT: 2.0

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/ReviewKD/ReviewKD-R18-R101-Mask.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewKD-R18-R101-Mask
INPUT:
  FORMAT: 'RGB'
MODEL:
  PIXEL_STD: [57.375, 57.120, 58.395]
  MASK_ON: True
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r18-r101mask.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 18
    RES2_OUT_CHANNELS: 64
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "ReviewKD"
  REVIEWKD:
    LOSS_WEIGHT: 1.5

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/ReviewKD/ReviewKD-R18-R101.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewKD-R18-R101
INPUT:
  FORMAT: 'RGB'
MODEL:
  PIXEL_STD: [57.375, 57.120, 58.395]
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r18-r101.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 18
    RES2_OUT_CHANNELS: 64
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "ReviewKD"
  REVIEWKD:
    LOSS_WEIGHT: 1.2

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/ReviewKD/ReviewKD-R50-R101-Mask.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewKD-R50-R101-Mask
MODEL:
  MASK_ON: True
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r50-r101mask.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 50
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "ReviewKD"
  REVIEWKD:
    LOSS_WEIGHT: 0.8

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/configs/ReviewKD/ReviewKD-R50-R101.yaml
================================================
_BASE_: "../Base-Distillation.yaml"
OUTPUT_DIR: output/ReviewKD-R50-R101
MODEL:
  BACKBONE:
    NAME: "build_resnet_fpn_backbone_kd"
  WEIGHTS: "pretrained/r50-r101.pth"
  RESNETS:
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    DEPTH: 50
  FPN:
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
  PROPOSAL_GENERATOR:
    NAME: "RPN"
  ROI_HEADS:
    NAME: "StandardROIHeads"

TEACHER:
  MODEL:
    RESNETS:
      DEPTH: 101
KD:
  TYPE: "ReviewKD"
  REVIEWKD:
    LOSS_WEIGHT: 1.0

SOLVER:
  IMS_PER_BATCH: 8
  BASE_LR: 0.01
  MAX_ITER: 180000
  STEPS:
  - 120000
  - 160000



================================================
FILE: detection/model/__init__.py
================================================
import torch

from .rcnn import RCNNKD
from .config import add_distillation_cfg
from .backbone import build_resnet_fpn_backbone_kd


================================================
FILE: detection/model/backbone/__init__.py
================================================
from .resnet import build_resnet_backbone_kd
from .fpn import build_resnet_fpn_backbone_kd, build_mobilenetv2_fpn_backbone


================================================
FILE: detection/model/backbone/fpn.py
================================================
from .resnet import build_resnet_backbone_kd
from .mobilenetv2 import build_mobilenetv2_backbone
from detectron2.modeling.backbone import BACKBONE_REGISTRY, FPN
from detectron2.modeling.backbone.fpn import LastLevelMaxPool
from detectron2.layers import Conv2d, ShapeSpec, get_norm



@BACKBONE_REGISTRY.register()
def build_resnet_fpn_backbone_kd(cfg, input_shape: ShapeSpec):
    # add ResNet 18
    """
    Args:
        cfg: a detectron2 CfgNode

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_resnet_backbone_kd(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone

@BACKBONE_REGISTRY.register()
def build_mobilenetv2_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_mobilenetv2_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone



================================================
FILE: detection/model/backbone/mobilenetv2.py
================================================
"""
Creates a MobileNetV2 Model as defined in:
Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen. (2018). 
MobileNetV2: Inverted Residuals and Linear Bottlenecks
arXiv preprint arXiv:1801.04381.
import from https://github.com/tonylins/pytorch-mobilenet-v2
"""

import torch.nn as nn
import math
from detectron2.modeling.backbone import BACKBONE_REGISTRY
from detectron2.modeling.backbone import Backbone, FPN
from detectron2.modeling.backbone.fpn import LastLevelMaxPool

from detectron2.layers import (
    Conv2d,
    DeformConv,
    FrozenBatchNorm2d,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)

__all__ = ['mobilenetv2']


def _make_divisible(v, divisor, min_value=None):
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    :param v:
    :param divisor:
    :param min_value:
    :return:
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


def conv_3x3_bn(inp, oup, stride, bn):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        get_norm(bn, oup),
        nn.ReLU6(inplace=True)
    )


def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        get_norm(bn, oup),
        nn.ReLU6(inplace=True)
    )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, bn):
        super(InvertedResidual, self).__init__()
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.identity = stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                get_norm(bn, hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                get_norm(bn, oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                get_norm(bn, hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                get_norm(bn, hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                get_norm(bn, oup),
            )

    def forward(self, x):
        if self.identity:
            return x + self.conv(x)
        else:
            return self.conv(x)

    def freeze(self):
        for p in self.parameters():
            p.requires_grad = False
        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
        return self



class MobileNetV2(Backbone):
    def __init__(self, cfg, input_shape, width_mult = 1.):
        super(MobileNetV2, self).__init__()
        self._out_features        = cfg.MODEL.MOBILENETV2.OUT_FEATURES
        bn = cfg.MODEL.MOBILENETV2.NORM
        freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
        

        # setting of inverted residual blocks
        self.cfgs = [
            # t, c, n, s
            [1,  16, 1, 1, ''],
            [6,  24, 2, 2, 'm2'],
            [6,  32, 3, 2, 'm3'],
            [6,  64, 4, 2, ''],
            [6,  96, 3, 1, 'm4'],
            [6, 160, 3, 2, ''],
            [6, 320, 1, 1, 'm5'],
        ]

        # building first layer
        input_channel = _make_divisible(32 * width_mult, 4 if width_mult == 0.1 else 8)
        layers = [conv_3x3_bn(input_shape.channels, input_channel, 2, bn)]
        if freeze_at >= 1:
            for p in layers[0].parameters():
                p.requires_grad = False
            layers[0] = FrozenBatchNorm2d.convert_frozen_batchnorm(layers[0])
        # building inverted residual blocks
        block = InvertedResidual
        self.stage_name = ['']
        self._out_feature_channels = {}
        self._out_feature_strides = {}
        cur_stride = 2
        cur_stage = 2
        for t, c, n, s, name in self.cfgs:
            output_channel = _make_divisible(c * width_mult, 4 if width_mult == 0.1 else 8)
            cur_stride = cur_stride * s
            for i in range(n):
                layers.append(block(input_channel, output_channel, s if i == 0 else 1, t, bn))
                if cur_stage <= freeze_at :
                    layers[-1].freeze()
                if name != '' and i == n-1:
                    self._out_feature_channels[name] = output_channel
                    self._out_feature_strides[name] = cur_stride
                    cur_stage += 1
                input_channel = output_channel
                self.stage_name.append(name if i == n-1 else '')
        self.features = nn.Sequential(*layers)
        # building last several layers
#        output_channel = _make_divisible(1280 * width_mult, 4 if width_mult == 0.1 else 8) if width_mult > 1.0 else 1280
#        self.conv = conv_1x1_bn(input_channel, output_channel)
#        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
#        self.classifier = nn.Linear(output_channel, num_classes)

        self._initialize_weights()

    def forward(self, x):
        output = {}
        for i in range(len(self.features)):
            x = self.features[i](x)
            if self.stage_name[i] in self._out_features:
                output[self.stage_name[i]] = x
        return output
        '''
        x = self.features(x)
        x = self.conv(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x
        '''

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }


        
@BACKBONE_REGISTRY.register()
def build_mobilenetv2_backbone(cfg, input_shape):
    """
    Constructs a MobileNet V2 model
    """
    return MobileNetV2(cfg, input_shape)





================================================
FILE: detection/model/backbone/resnet.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import numpy as np
import fvcore.nn.weight_init as weight_init
import torch
import torch.nn.functional as F
from torch import nn

from detectron2.layers import (
    Conv2d,
    DeformConv,
    FrozenBatchNorm2d,
    ModulatedDeformConv,
    ShapeSpec,
    get_norm,
)

from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY

__all__ = [
    "ResNetBlockBase",
    "BottleneckBlock",
    "DeformBottleneckBlock",
    "BasicStem",
    "ResNet",
    "make_stage",
    "build_resnet_backbone",
]


class ResNetBlockBase(nn.Module):
    def __init__(self, in_channels, out_channels, stride):
        """
        The `__init__` method of any subclass should also contain these arguments.

        Args:
            in_channels (int):
            out_channels (int):
            stride (int):
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def freeze(self):
        for p in self.parameters():
            p.requires_grad = False
        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
        return self


class BasicBlock(ResNetBlockBase):
    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
    ):
        super().__init__(in_channels, out_channels, stride)
        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=stride,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            norm=get_norm(norm, out_channels),
        )
        self.conv2 = Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            groups=num_groups,
            dilation=1,
            norm=get_norm(norm, out_channels),
        )
        for layer in [self.conv1, self.conv2, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        out = self.conv2(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out

class BottleneckBlock(ResNetBlockBase):
    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
    ):
        """
        Args:
            norm (str or callable): a callable that takes the number of
                channels and return a `nn.Module`, or a pre-defined string
                (one of {"FrozenBN", "BN", "GN"}).
            stride_in_1x1 (bool): when stride==2, whether to put stride in the
                first 1x1 convolution or the bottleneck 3x3 convolution.
        """
        super().__init__(in_channels, out_channels, stride)

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        # The original MSRA ResNet models have stride in the first 1x1 conv
        # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
        # stride in the 3x3 conv
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

        # Zero-initialize the last normalization in each residual branch,
        # so that at the beginning, the residual branch starts with zeros,
        # and each residual block behaves like an identity.
        # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
        # "For BN layers, the learnable scaling coefficient γ is initialized
        # to be 1, except for each residual block's last BN
        # where γ is initialized to be 0."

        # nn.init.constant_(self.conv3.norm.weight, 0)
        # TODO this somehow hurts performance when training GN models from scratch.
        # Add it as an option when we need to use this code to train a backbone.

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        out = self.conv2(out)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


class DeformBottleneckBlock(ResNetBlockBase):
    def __init__(
        self,
        in_channels,
        out_channels,
        *,
        bottleneck_channels,
        stride=1,
        num_groups=1,
        norm="BN",
        stride_in_1x1=False,
        dilation=1,
        deform_modulated=False,
        deform_num_groups=1,
    ):
        """
        Similar to :class:`BottleneckBlock`, but with deformable conv in the 3x3 convolution.
        """
        super().__init__(in_channels, out_channels, stride)
        self.deform_modulated = deform_modulated

        if in_channels != out_channels:
            self.shortcut = Conv2d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride,
                bias=False,
                norm=get_norm(norm, out_channels),
            )
        else:
            self.shortcut = None

        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
            norm=get_norm(norm, bottleneck_channels),
        )

        if deform_modulated:
            deform_conv_op = ModulatedDeformConv
            # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
            offset_channels = 27
        else:
            deform_conv_op = DeformConv
            offset_channels = 18

        self.conv2_offset = Conv2d(
            bottleneck_channels,
            offset_channels * deform_num_groups,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            dilation=dilation,
        )
        self.conv2 = deform_conv_op(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1 * dilation,
            bias=False,
            groups=num_groups,
            dilation=dilation,
            deformable_groups=deform_num_groups,
            norm=get_norm(norm, bottleneck_channels),
        )

        self.conv3 = Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            bias=False,
            norm=get_norm(norm, out_channels),
        )

        for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
            if layer is not None:  # shortcut can be None
                weight_init.c2_msra_fill(layer)

        nn.init.constant_(self.conv2_offset.weight, 0)
        nn.init.constant_(self.conv2_offset.bias, 0)

    def forward(self, x):
        out = self.conv1(x)
        out = F.relu_(out)

        if self.deform_modulated:
            offset_mask = self.conv2_offset(out)
            offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
            offset = torch.cat((offset_x, offset_y), dim=1)
            mask = mask.sigmoid()
            out = self.conv2(out, offset, mask)
        else:
            offset = self.conv2_offset(out)
            out = self.conv2(out, offset)
        out = F.relu_(out)

        out = self.conv3(out)

        if self.shortcut is not None:
            shortcut = self.shortcut(x)
        else:
            shortcut = x

        out += shortcut
        out = F.relu_(out)
        return out


def make_stage(block_class, num_blocks, first_stride, **kwargs):
    """
    Create a resnet stage by creating many blocks.

    Args:
        block_class (class): a subclass of ResNetBlockBase
        num_blocks (int):
        first_stride (int): the stride of the first block. The other blocks will have stride=1.
            A `stride` argument will be passed to the block constructor.
        kwargs: other arguments passed to the block constructor.

    Returns:
        list[nn.Module]: a list of block module.
    """
    blocks = []
    for i in range(num_blocks):
        blocks.append(block_class(stride=first_stride if i == 0 else 1, **kwargs))
        kwargs["in_channels"] = kwargs["out_channels"]
    return blocks


class BasicStem(nn.Module):
    def __init__(self, in_channels=3, out_channels=64, norm="BN"):
        """
        Args:
            norm (str or callable): a callable that takes the number of
                channels and return a `nn.Module`, or a pre-defined string
                (one of {"FrozenBN", "BN", "GN"}).
        """
        super().__init__()
        self.conv1 = Conv2d(
            in_channels,
            out_channels,
            kernel_size=7,
            stride=2,
            padding=3,
            bias=False,
            norm=get_norm(norm, out_channels),
        )
        weight_init.c2_msra_fill(self.conv1)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu_(x)
        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        return x

    @property
    def out_channels(self):
        return self.conv1.out_channels

    @property
    def stride(self):
        return 4  # = stride 2 conv -> stride 2 max pool


class ResNet(Backbone):
    def __init__(self, stem, stages, num_classes=None, out_features=None):
        """
        Args:
            stem (nn.Module): a stem module
            stages (list[list[ResNetBlock]]): several (typically 4) stages,
                each contains multiple :class:`ResNetBlockBase`.
            num_classes (None or int): if None, will not perform classification.
            out_features (list[str]): name of the layers whose outputs should
                be returned in forward. Can be anything in "stem", "linear", or "res2" ...
                If None, will return the output of the last layer.
        """
        super(ResNet, self).__init__()
        self.stem = stem
        self.num_classes = num_classes

        current_stride = self.stem.stride
        self._out_feature_strides = {"stem": current_stride}
        self._out_feature_channels = {"stem": self.stem.out_channels}

        self.stages_and_names = []
        for i, blocks in enumerate(stages):
            for block in blocks:
                assert isinstance(block, ResNetBlockBase), block
                curr_channels = block.out_channels
            stage = nn.Sequential(*blocks)
            name = "res" + str(i + 2)
            self.add_module(name, stage)
            self.stages_and_names.append((stage, name))
            self._out_feature_strides[name] = current_stride = int(
                current_stride * np.prod([k.stride for k in blocks])
            )
            self._out_feature_channels[name] = blocks[-1].out_channels

        if num_classes is not None:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.linear = nn.Linear(curr_channels, num_classes)

            # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
            # "The 1000-way fully-connected layer is initialized by
            # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
            nn.init.normal_(self.linear.weight, std=0.01)
            name = "linear"

        if out_features is None:
            out_features = [name]
        self._out_features = out_features
        assert len(self._out_features)
        children = [x[0] for x in self.named_children()]
        for out_feature in self._out_features:
            assert out_feature in children, "Available children: {}".format(", ".join(children))

    def forward(self, x):
        outputs = {}
        x = self.stem(x)
        if "stem" in self._out_features:
            outputs["stem"] = x
        for stage, name in self.stages_and_names:
            x = stage(x)
            if name in self._out_features:
                outputs[name] = x
        if self.num_classes is not None:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.linear(x)
            if "linear" in self._out_features:
                outputs["linear"] = x
        return outputs

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }


@BACKBONE_REGISTRY.register()
def build_resnet_backbone_kd(cfg, input_shape):
    # add ResNet18 setting
    """
    Create a ResNet instance from config.

    Returns:
        ResNet: a :class:`ResNet` instance.
    """
    # need registration of new blocks/stems?
    norm = cfg.MODEL.RESNETS.NORM
    stem = BasicStem(
        in_channels=input_shape.channels,
        out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
        norm=norm,
    )
    freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT

    if freeze_at >= 1:
        for p in stem.parameters():
            p.requires_grad = False
        stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem)

    # fmt: off
    out_features        = cfg.MODEL.RESNETS.OUT_FEATURES
    depth               = cfg.MODEL.RESNETS.DEPTH
    num_groups          = cfg.MODEL.RESNETS.NUM_GROUPS
    width_per_group     = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
    bottleneck_channels = num_groups * width_per_group
    in_channels         = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
    out_channels        = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    stride_in_1x1       = cfg.MODEL.RESNETS.STRIDE_IN_1X1
    res5_dilation       = cfg.MODEL.RESNETS.RES5_DILATION
    deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
    deform_modulated    = cfg.MODEL.RESNETS.DEFORM_MODULATED
    deform_num_groups   = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
    # fmt: on
    assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)

    num_blocks_per_stage = {18:[2,2,2,2], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth]

    stages = []

    # Avoid creating variables without gradients
    # It consumes extra memory and may cause allreduce to fail
    out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features]
    max_stage_idx = max(out_stage_idx)
    for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)):
        dilation = res5_dilation if stage_idx == 5 else 1
        first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
        stage_kargs = {
            "num_blocks": num_blocks_per_stage[idx],
            "first_stride": first_stride,
            "in_channels": in_channels,
            "bottleneck_channels": bottleneck_channels,
            "out_channels": out_channels,
            "num_groups": num_groups,
            "norm": norm,
            "stride_in_1x1": stride_in_1x1,
            "dilation": dilation,
        }
        if depth < 50:
            stage_kargs["block_class"] = BasicBlock
        elif deform_on_per_stage[idx]:
            stage_kargs["block_class"] = DeformBottleneckBlock
            stage_kargs["deform_modulated"] = deform_modulated
            stage_kargs["deform_num_groups"] = deform_num_groups
        else:
            stage_kargs["block_class"] = BottleneckBlock
        blocks = make_stage(**stage_kargs)
        in_channels = out_channels
        out_channels *= 2
        bottleneck_channels *= 2

        if freeze_at >= stage_idx:
            for block in blocks:
                block.freeze()
        stages.append(blocks)
    return ResNet(stem, stages, out_features=out_features)


================================================
FILE: detection/model/config.py
================================================
from detectron2.config import CfgNode as CN
import numpy as np

def add_distillation_cfg(cfg):
    cfg.MODEL.MOBILENETV2 = CN()

    # Debug
    cfg.MODEL.MOBILENETV2.DEBUG = 0

    cfg.MODEL.MOBILENETV2.OUT_FEATURES = ['m2']
    cfg.MODEL.MOBILENETV2.NORM = 'FrozenBN'

    cfg.KD = CN()
    cfg.KD.TYPE = "DKD" # ("DKD", "ReviewKD")

    # DKD
    cfg.KD.DKD = CN()
    cfg.KD.DKD.ALPHA = 1.0
    cfg.KD.DKD.BETA = 0.25
    cfg.KD.DKD.T = 1.0

    # REVIEWKD
    cfg.KD.REVIEWKD = CN()
    cfg.KD.REVIEWKD.LOSS_WEIGHT = 1.0

    add_teacher_cfg(cfg)
    

def add_teacher_cfg(cfg):
    cfg.TEACHER = CN()
    cfg.TEACHER.KD = CN()
    cfg.TEACHER.KD.FEATURE_KD_MASK = 'None' # fine_grained_mask, gt_box_mask
    cfg.TEACHER.MODEL = CN()
    cfg.TEACHER.MODEL.LOAD_PROPOSALS = False
    cfg.TEACHER.MODEL.MASK_ON = False
    cfg.TEACHER.MODEL.KEYPOINT_ON = False
    cfg.TEACHER.MODEL.DEVICE = "cuda"
    cfg.TEACHER.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
    
    # Path (a file path, or URL like detectron2://.., https://..) to a checkpoint file
    # to be loaded to the model. You can find available models in the model zoo.
    cfg.TEACHER.MODEL.WEIGHTS = ""
    
    # Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR).
    # To train on images of different number of channels, just set different mean & std.
    # Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
    cfg.TEACHER.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675]
    # When using pre-trained models in Detectron1 or any MSRA models,
    # std has been absorbed into its conv1 weights, so the std needs to be set 1.
    # Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
    cfg.TEACHER.MODEL.PIXEL_STD = [1.0, 1.0, 1.0]
    
    
    # -----------------------------------------------------------------------------
    # INPUT
    # -----------------------------------------------------------------------------
    cfg.TEACHER.INPUT = CN()
    # Size of the smallest side of the image during training
    cfg.TEACHER.INPUT.MIN_SIZE_TRAIN = (800,)
    # Sample size of smallest side by choice or random selection from range give by
    # INPUT.MIN_SIZE_TRAIN
    cfg.TEACHER.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice"
    # Maximum size of the side of the image during training
    cfg.TEACHER.INPUT.MAX_SIZE_TRAIN = 1333
    # Size of the smallest side of the image during testing. Set to zero to disable resize in testing.
    cfg.TEACHER.INPUT.MIN_SIZE_TEST = 800
    # Maximum size of the side of the image during testing
    cfg.TEACHER.INPUT.MAX_SIZE_TEST = 1333
    # Mode for flipping images used in data augmentation during training
    # choose one of ["horizontal, "vertical", "none"]
    cfg.TEACHER.INPUT.RANDOM_FLIP = "horizontal"
    
    # `True` if cropping is used for data augmentation during training
    cfg.TEACHER.INPUT.CROP = CN({"ENABLED": False})
    # Cropping type. See documentation of `detectron2.data.transforms.RandomCrop` for explanation.
    cfg.TEACHER.INPUT.CROP.TYPE = "relative_range"
    # Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of
    # pixels if CROP.TYPE is "absolute"
    cfg.TEACHER.INPUT.CROP.SIZE = [0.9, 0.9]
    
    
    # Whether the model needs RGB, YUV, HSV etc.
    # Should be one of the modes defined here, as we use PIL to read the image:
    # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes
    # with BGR being the one exception. One can set image format to BGR, we will
    # internally use RGB for conversion and flip the channels over
    cfg.TEACHER.INPUT.FORMAT = "BGR"
    # The ground truth mask format that the model will use.
    # Mask R-CNN supports either "polygon" or "bitmask" as ground truth.
    cfg.TEACHER.INPUT.MASK_FORMAT = "polygon"  # alternative: "bitmask"
    
    
    # -----------------------------------------------------------------------------
    # Dataset
    # -----------------------------------------------------------------------------
    cfg.TEACHER.DATASETS = CN()
    # List of the dataset names for training. Must be registered in DatasetCatalog
    # Samples from these datasets will be merged and used as one dataset.
    cfg.TEACHER.DATASETS.TRAIN = ()
    # List of the pre-computed proposal files for training, which must be consistent
    # with datasets listed in DATASETS.TRAIN.
    cfg.TEACHER.DATASETS.PROPOSAL_FILES_TRAIN = ()
    # Number of top scoring precomputed proposals to keep for training
    cfg.TEACHER.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000
    # List of the dataset names for testing. Must be registered in DatasetCatalog
    cfg.TEACHER.DATASETS.TEST = ()
    # List of the pre-computed proposal files for test, which must be consistent
    # with datasets listed in DATASETS.TEST.
    cfg.TEACHER.DATASETS.PROPOSAL_FILES_TEST = ()
    # Number of top scoring precomputed proposals to keep for test
    cfg.TEACHER.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000
    
    # -----------------------------------------------------------------------------
    # DataLoader
    # -----------------------------------------------------------------------------
    cfg.TEACHER.DATALOADER = CN()
    # Number of data loading threads
    cfg.TEACHER.DATALOADER.NUM_WORKERS = 4
    # If True, each batch should contain only images for which the aspect ratio
    # is compatible. This groups portrait images together, and landscape images
    # are not batched with portrait images.
    cfg.TEACHER.DATALOADER.ASPECT_RATIO_GROUPING = True
    # Options: TrainingSampler, RepeatFactorTrainingSampler
    cfg.TEACHER.DATALOADER.SAMPLER_TRAIN = "TrainingSampler"
    # Repeat threshold for RepeatFactorTrainingSampler
    cfg.TEACHER.DATALOADER.REPEAT_THRESHOLD = 0.0
    # Tf True, when working on datasets that have instance annotations, the
    # training dataloader will filter out images without associated annotations
    cfg.TEACHER.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True
    
    # ---------------------------------------------------------------------------- #
    # Backbone options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.BACKBONE = CN()
    
    cfg.TEACHER.MODEL.BACKBONE.NAME = "build_resnet_backbone"
    # Freeze the first several stages so they are not trained.
    # There are 5 stages in ResNet. The first is a convolution, and the following
    # stages are each group of residual blocks.
    cfg.TEACHER.MODEL.BACKBONE.FREEZE_AT = 2
    
    
    # ---------------------------------------------------------------------------- #
    # FPN options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.FPN = CN()
    # Names of the input feature maps to be used by FPN
    # They must have contiguous power of 2 strides
    # e.g., ["res2", "res3", "res4", "res5"]
    cfg.TEACHER.MODEL.FPN.IN_FEATURES = []
    cfg.TEACHER.MODEL.FPN.OUT_CHANNELS = 256
    
    # Options: "" (no norm), "GN"
    cfg.TEACHER.MODEL.FPN.NORM = ""
    
    # Types for fusing the FPN top-down and lateral features. Can be either "sum" or "avg"
    cfg.TEACHER.MODEL.FPN.FUSE_TYPE = "sum"
    
    
    # ---------------------------------------------------------------------------- #
    # Proposal generator options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.PROPOSAL_GENERATOR = CN()
    # Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals"
    cfg.TEACHER.MODEL.PROPOSAL_GENERATOR.NAME = "RPN"
    # Proposal height and width both need to be greater than MIN_SIZE
    # (a the scale used during training or inference)
    cfg.TEACHER.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0
    
    
    # ---------------------------------------------------------------------------- #
    # Anchor generator options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.ANCHOR_GENERATOR = CN()
    # The generator can be any name in the ANCHOR_GENERATOR registry
    cfg.TEACHER.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator"
    # Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input.
    # Format: list[list[float]]. SIZES[i] specifies the list of sizes to use for
    # IN_FEATURES[i]; len(SIZES) must be equal to len(IN_FEATURES) or 1.
    # When len(SIZES) == 1, SIZES[0] is used for all IN_FEATURES.
    cfg.TEACHER.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]]
    # Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect
    # ratios are generated by an anchor generator.
    # Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W)
    # to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true,
    # or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used
    # for all IN_FEATURES.
    cfg.TEACHER.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]]
    # Anchor angles.
    # list[list[float]], the angle in degrees, for each input feature map.
    # ANGLES[i] specifies the list of angles for IN_FEATURES[i].
    cfg.TEACHER.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]]
    # Relative offset between the center of the first anchor and the top-left corner of the image
    # Value has to be in [0, 1). Recommend to use 0.5, which means half stride.
    # The value is not expected to affect model accuracy.
    cfg.TEACHER.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0
    
    # ---------------------------------------------------------------------------- #
    # RPN options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.RPN = CN()
    cfg.TEACHER.MODEL.RPN.HEAD_NAME = "StandardRPNHead"  # used by RPN_HEAD_REGISTRY
    
    # Names of the input feature maps to be used by RPN
    # e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN
    cfg.TEACHER.MODEL.RPN.IN_FEATURES = ["res4"]
    # Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels
    # Set to -1 or a large value, e.g. 100000, to disable pruning anchors
    cfg.TEACHER.MODEL.RPN.BOUNDARY_THRESH = -1
    # IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD]
    # Minimum overlap required between an anchor and ground-truth box for the
    # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
    # ==> positive RPN example: 1)
    # Maximum overlap allowed between an anchor and ground-truth box for the
    # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
    # ==> negative RPN example: 0)
    # Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD)
    # are ignored (-1)
    cfg.TEACHER.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7]
    cfg.TEACHER.MODEL.RPN.IOU_LABELS = [0, -1, 1]
    # Number of regions per image used to train RPN
    cfg.TEACHER.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
    # Target fraction of foreground (positive) examples per RPN minibatch
    cfg.TEACHER.MODEL.RPN.POSITIVE_FRACTION = 0.5
    # Options are: "smooth_l1", "giou"
    cfg.TEACHER.MODEL.RPN.BBOX_REG_LOSS_TYPE = "smooth_l1"
    cfg.TEACHER.MODEL.RPN.BBOX_REG_LOSS_WEIGHT = 1.0
    # Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets
    cfg.TEACHER.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
    # The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
    cfg.TEACHER.MODEL.RPN.SMOOTH_L1_BETA = 0.0
    cfg.TEACHER.MODEL.RPN.LOSS_WEIGHT = 1.0
    # Number of top scoring RPN proposals to keep before applying NMS
    # When FPN is used, this is *per FPN level* (not total)
    cfg.TEACHER.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000
    cfg.TEACHER.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000
    # Number of top scoring RPN proposals to keep after applying NMS
    # When FPN is used, this limit is applied per level and then again to the union
    # of proposals from all levels
    # NOTE: When FPN is used, the meaning of this config is different from Detectron1.
    # It means per-batch topk in Detectron1, but per-image topk here.
    # See the "find_top_rpn_proposals" function for details.
    cfg.TEACHER.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000
    cfg.TEACHER.MODEL.RPN.POST_NMS_TOPK_TEST = 1000
    # NMS threshold used on RPN proposals
    cfg.TEACHER.MODEL.RPN.NMS_THRESH = 0.7
    # Set this to -1 to use the same number of output channels as input channels.
    cfg.TEACHER.MODEL.RPN.CONV_DIMS = [-1]
    
    # ---------------------------------------------------------------------------- #
    # ROI HEADS options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.ROI_HEADS = CN()
    cfg.TEACHER.MODEL.ROI_HEADS.NAME = "Res5ROIHeads"
    # Number of foreground classes
    cfg.TEACHER.MODEL.ROI_HEADS.NUM_CLASSES = 80
    # Names of the input feature maps to be used by ROI heads
    # Currently all heads (box, mask, ...) use the same input feature map list
    # e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN
    cfg.TEACHER.MODEL.ROI_HEADS.IN_FEATURES = ["res4"]
    # IOU overlap ratios [IOU_THRESHOLD]
    # Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD)
    # Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD)
    cfg.TEACHER.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5]
    cfg.TEACHER.MODEL.ROI_HEADS.IOU_LABELS = [0, 1]
    # RoI minibatch size *per image* (number of regions of interest [ROIs])
    # Total number of RoIs per training minibatch =
    #   ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH
    # E.g., a common configuration is: 512 * 16 = 8192
    cfg.TEACHER.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
    # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
    cfg.TEACHER.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
    
    # Only used on test mode
    
    # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
    # balance obtaining high recall with not having too many low precision
    # detections that will slow down inference post processing steps (like NMS)
    # A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down
    # inference.
    cfg.TEACHER.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05
    # Overlap threshold used for non-maximum suppression (suppress boxes with
    # IoU >= this threshold)
    cfg.TEACHER.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5
    # If True, augment proposals with ground-truth boxes before sampling proposals to
    # train ROI heads.
    cfg.TEACHER.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True
    
    # ---------------------------------------------------------------------------- #
    # Box Head
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.ROI_BOX_HEAD = CN()
    # C4 don't use head name option
    # Options for non-C4 models: FastRCNNConvFCHead,
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.NAME = ""
    # Options are: "smooth_l1", "giou"
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE = "smooth_l1"
    # The final scaling coefficient on the box regression loss, used to balance the magnitude of its
    # gradients with other losses in the model. See also `MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT`.
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT = 1.0
    # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
    # These are empirically chosen to approximately lead to unit variance targets
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
    # The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1.
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
    # Type of pooling operation applied to the incoming feature map for each RoI
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2"
    
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.NUM_FC = 0
    # Hidden layer dimension for FC layers in the RoI box head
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.FC_DIM = 1024
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.NUM_CONV = 0
    # Channel dimension for Conv layers in the RoI box head
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.CONV_DIM = 256
    # Normalization method for the convolution layers.
    # Options: "" (no norm), "GN", "SyncBN".
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.NORM = ""
    # Whether to use class agnostic for bbox regression
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False
    # If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes.
    cfg.TEACHER.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False
    
    # ---------------------------------------------------------------------------- #
    # Cascaded Box Head
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.ROI_BOX_CASCADE_HEAD = CN()
    # The number of cascade stages is implicitly defined by the length of the following two configs.
    cfg.TEACHER.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = (
        (10.0, 10.0, 5.0, 5.0),
        (20.0, 20.0, 10.0, 10.0),
        (30.0, 30.0, 15.0, 15.0),
    )
    cfg.TEACHER.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7)
    
    
    # ---------------------------------------------------------------------------- #
    # Mask Head
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.ROI_MASK_HEAD = CN()
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead"
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.NUM_CONV = 0  # The number of convs in the mask head
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.CONV_DIM = 256
    # Normalization method for the convolution layers.
    # Options: "" (no norm), "GN", "SyncBN".
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.NORM = ""
    # Whether to use class agnostic for mask prediction
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False
    # Type of pooling operation applied to the incoming feature map for each RoI
    cfg.TEACHER.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2"
    
    
    # ---------------------------------------------------------------------------- #
    # Keypoint Head
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD = CN()
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead"
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8))
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17  # 17 is the number of keypoints in COCO.
    
    # Images with too few (or no) keypoints are excluded from training.
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1
    # Normalize by the total number of visible keypoints in the minibatch if True.
    # Otherwise, normalize by the total number of keypoints that could ever exist
    # in the minibatch.
    # The keypoint softmax loss is only calculated on visible keypoints.
    # Since the number of visible keypoints can vary significantly between
    # minibatches, this has the effect of up-weighting the importance of
    # minibatches with few visible keypoints. (Imagine the extreme case of
    # only one visible keypoint versus N: in the case of N, each one
    # contributes 1/N to the gradient compared to the single keypoint
    # determining the gradient direction). Instead, we can normalize the
    # loss by the total number of keypoints, if it were the case that all
    # keypoints were visible in a full minibatch. (Returning to the example,
    # this means that the one visible keypoint contributes as much as each
    # of the N keypoints.)
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True
    # Multi-task loss weight to use for keypoints
    # Recommended values:
    #   - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True
    #   - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0
    # Type of pooling operation applied to the incoming feature map for each RoI
    cfg.TEACHER.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2"
    
    # ---------------------------------------------------------------------------- #
    # Semantic Segmentation Head
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.SEM_SEG_HEAD = CN()
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead"
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"]
    # Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for
    # the correposnding pixel.
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255
    # Number of classes in the semantic segmentation head
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54
    # Number of channels in the 3x3 convs inside semantic-FPN heads.
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128
    # Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride.
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4
    # Normalization method for the convolution layers. Options: "" (no norm), "GN".
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.NORM = "GN"
    cfg.TEACHER.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0
    
    cfg.TEACHER.MODEL.PANOPTIC_FPN = CN()
    # Scaling of all losses from instance detection / segmentation head.
    cfg.TEACHER.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0
    
    # options when combining instance & semantic segmentation outputs
    cfg.TEACHER.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True})  # "COMBINE.ENABLED" is deprecated & not used
    cfg.TEACHER.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5
    cfg.TEACHER.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096
    cfg.TEACHER.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5
    
    
    # ---------------------------------------------------------------------------- #
    # RetinaNet Head
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.RETINANET = CN()
    
    # This is the number of foreground classes.
    cfg.TEACHER.MODEL.RETINANET.NUM_CLASSES = 80
    
    cfg.TEACHER.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
    
    # Convolutions to use in the cls and bbox tower
    # NOTE: this doesn't include the last conv for logits
    cfg.TEACHER.MODEL.RETINANET.NUM_CONVS = 4
    
    # IoU overlap ratio [bg, fg] for labeling anchors.
    # Anchors with < bg are labeled negative (0)
    # Anchors  with >= bg and < fg are ignored (-1)
    # Anchors with >= fg are labeled positive (1)
    cfg.TEACHER.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5]
    cfg.TEACHER.MODEL.RETINANET.IOU_LABELS = [0, -1, 1]
    
    # Prior prob for rare case (i.e. foreground) at the beginning of training.
    # This is used to set the bias for the logits layer of the classifier subnet.
    # This improves training stability in the case of heavy class imbalance.
    cfg.TEACHER.MODEL.RETINANET.PRIOR_PROB = 0.01
    
    # Inference cls score threshold, only anchors with score > INFERENCE_TH are
    # considered for inference (to improve speed)
    cfg.TEACHER.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05
    # Select topk candidates before NMS
    cfg.TEACHER.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000
    cfg.TEACHER.MODEL.RETINANET.NMS_THRESH_TEST = 0.5
    
    # Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets
    cfg.TEACHER.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
    
    # Loss parameters
    cfg.TEACHER.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0
    cfg.TEACHER.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25
    cfg.TEACHER.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1
    # Options are: "smooth_l1", "giou"
    cfg.TEACHER.MODEL.RETINANET.BBOX_REG_LOSS_TYPE = "smooth_l1"
    
    # One of BN, SyncBN, FrozenBN, GN
    # Only supports GN until unshared norm is implemented
    cfg.TEACHER.MODEL.RETINANET.NORM = ""
    
    
    # ---------------------------------------------------------------------------- #
    # ResNe[X]t options (ResNets = {ResNet, ResNeXt}
    # Note that parts of a resnet may be used for both the backbone and the head
    # These options apply to both
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.MODEL.RESNETS = CN()
    
    cfg.TEACHER.MODEL.RESNETS.DEPTH = 50
    cfg.TEACHER.MODEL.RESNETS.OUT_FEATURES = ["res4"]  # res4 for C4 backbone, res2..5 for FPN backbone
    
    # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
    cfg.TEACHER.MODEL.RESNETS.NUM_GROUPS = 1
    
    # Options: FrozenBN, GN, "SyncBN", "BN"
    cfg.TEACHER.MODEL.RESNETS.NORM = "FrozenBN"
    
    # Baseline width of each group.
    # Scaling this parameters will scale the width of all bottleneck layers.
    cfg.TEACHER.MODEL.RESNETS.WIDTH_PER_GROUP = 64
    
    # Place the stride 2 conv on the 1x1 filter
    # Use True only for the original MSRA ResNet; use False for C2 and Torch models
    cfg.TEACHER.MODEL.RESNETS.STRIDE_IN_1X1 = True
    
    # Apply dilation in stage "res5"
    cfg.TEACHER.MODEL.RESNETS.RES5_DILATION = 1
    
    # Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet
    # For R18 and R34, this needs to be set to 64
    cfg.TEACHER.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
    cfg.TEACHER.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
    
    # Apply Deformable Convolution in stages
    # Specify if apply deform_conv on Res2, Res3, Res4, Res5
    cfg.TEACHER.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False]
    # Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168);
    # Use False for DeformableV1.
    cfg.TEACHER.MODEL.RESNETS.DEFORM_MODULATED = False
    # Number of groups in deformable conv.
    cfg.TEACHER.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1
    
    
    # ---------------------------------------------------------------------------- #
    # Solver
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.SOLVER = CN()
    
    # See detectron2/solver/build.py for LR scheduler options
    cfg.TEACHER.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR"
    
    cfg.TEACHER.SOLVER.MAX_ITER = 40000
    
    cfg.TEACHER.SOLVER.BASE_LR = 0.001
    
    cfg.TEACHER.SOLVER.MOMENTUM = 0.9
    
    cfg.TEACHER.SOLVER.NESTEROV = False
    
    cfg.TEACHER.SOLVER.WEIGHT_DECAY = 0.0001
    # The weight decay that's applied to parameters of normalization layers
    # (typically the affine transformation)
    cfg.TEACHER.SOLVER.WEIGHT_DECAY_NORM = 0.0
    
    cfg.TEACHER.SOLVER.GAMMA = 0.1
    # The iteration number to decrease learning rate by GAMMA.
    cfg.TEACHER.SOLVER.STEPS = (30000,)
    
    cfg.TEACHER.SOLVER.WARMUP_FACTOR = 1.0 / 1000
    cfg.TEACHER.SOLVER.WARMUP_ITERS = 1000
    cfg.TEACHER.SOLVER.WARMUP_METHOD = "linear"
    
    # Save a checkpoint after every this number of iterations
    cfg.TEACHER.SOLVER.CHECKPOINT_PERIOD = 5000
    
    # Number of images per batch across all machines. This is also the number
    # of training images per step (i.e. per iteration). If we use 16 GPUs
    # and IMS_PER_BATCH = 32, each GPU will see 2 images per batch.
    # May be adjusted automatically if REFERENCE_WORLD_SIZE is set.
    cfg.TEACHER.SOLVER.IMS_PER_BATCH = 16
    
    # The reference number of workers (GPUs) this config is meant to train with.
    # It takes no effect when set to 0.
    # With a non-zero value, it will be used by DefaultTrainer to compute a desired
    # per-worker batch size, and then scale the other related configs (total batch size,
    # learning rate, etc) to match the per-worker batch size.
    # See documentation of `DefaultTrainer.auto_scale_workers` for details:
    cfg.TEACHER.SOLVER.REFERENCE_WORLD_SIZE = 0
    
    # Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for
    # biases. This is not useful (at least for recent models). You should avoid
    # changing these and they exist only to reproduce Detectron v1 training if
    # desired.
    cfg.TEACHER.SOLVER.BIAS_LR_FACTOR = 1.0
    cfg.TEACHER.SOLVER.WEIGHT_DECAY_BIAS = cfg.TEACHER.SOLVER.WEIGHT_DECAY
    
    # Gradient clipping
    cfg.TEACHER.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False})
    # Type of gradient clipping, currently 2 values are supported:
    # - "value": the absolute values of elements of each gradients are clipped
    # - "norm": the norm of the gradient for each parameter is clipped thus
    #   affecting all elements in the parameter
    cfg.TEACHER.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value"
    # Maximum absolute value used for clipping gradients
    cfg.TEACHER.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0
    # Floating point number p for L-p norm to be used with the "norm"
    # gradient clipping type; for L-inf, please specify .inf
    cfg.TEACHER.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
    
    # Enable automatic mixed precision for training
    # Note that this does not change model's inference behavior.
    # To use AMP in inference, run inference under autocast()
    cfg.TEACHER.SOLVER.AMP = CN({"ENABLED": False})
    
    # ---------------------------------------------------------------------------- #
    # Specific test options
    # ---------------------------------------------------------------------------- #
    cfg.TEACHER.TEST = CN()
    # For end-to-end tests to verify the expected accuracy.
    # Each item is [task, metric, value, tolerance]
    # e.g.: [['bbox', 'AP', 38.5, 0.2]]
    cfg.TEACHER.TEST.EXPECTED_RESULTS = []
    # The period (in terms of steps) to evaluate the model during training.
    # Set to 0 to disable.
    cfg.TEACHER.TEST.EVAL_PERIOD = 0
    # The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval
    # When empty, it will use the defaults in COCO.
    # Otherwise it should be a list[float] with the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
    cfg.TEACHER.TEST.KEYPOINT_OKS_SIGMAS = []
    # Maximum number of detections to return per image during inference (100 is
    # based on the limit established for the COCO dataset).
    cfg.TEACHER.TEST.DETECTIONS_PER_IMAGE = 100
    
    cfg.TEACHER.TEST.AUG = CN({"ENABLED": False})
    cfg.TEACHER.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
    cfg.TEACHER.TEST.AUG.MAX_SIZE = 4000
    cfg.TEACHER.TEST.AUG.FLIP = True
    
    cfg.TEACHER.TEST.PRECISE_BN = CN({"ENABLED": False})
    cfg.TEACHER.TEST.PRECISE_BN.NUM_ITER = 200
    
    # ---------------------------------------------------------------------------- #
    # Misc options
    # ---------------------------------------------------------------------------- #
    # Directory where output files are written
    cfg.TEACHER.OUTPUT_DIR = "./output"
    # Set seed to negative to fully randomize everything.
    # Set seed to positive to use a fixed seed. Note that a fixed seed increases
    # reproducibility but does not guarantee fully deterministic behavior.
    # Disabling all parallelism further increases reproducibility.
    cfg.TEACHER.SEED = -1
    # Benchmark different cudnn algorithms.
    # If input images have very different sizes, this option will have large overhead
    # for about 10k iterations. It usually hurts total time, but can benefit for certain models.
    # If input images have the same or similar sizes, benchmark is often helpful.
    cfg.TEACHER.CUDNN_BENCHMARK = False
    # The period (in terms of steps) for minibatch visualization at train time.
    # Set to 0 to disable.
    cfg.TEACHER.VIS_PERIOD = 0
    
    # global config is for quick hack purposes.
    # You can set them in command line or config files,
    # and access it with:
    #
    # from detectron2.config import global_cfg
    # print(global_cfg.HACK)
    #
    # Do not commit any configs into it.
    cfg.TEACHER.GLOBAL = CN()
    cfg.TEACHER.GLOBAL.HACK = 1.0



================================================
FILE: detection/model/rcnn.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn

from detectron2.config import configurable
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.structures import ImageList, Instances
from detectron2.utils.events import get_event_storage
from detectron2.utils.logger import log_first_n
from detectron2.modeling.backbone import Backbone, build_backbone
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.modeling.roi_heads import build_roi_heads
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from mdistiller.distillers.DKD import dkd_loss

from .teacher import build_teacher
from .reviewkd import build_kd_trans, hcl


__all__ = ["RCNNKD", "ProposalNetwork"]

def rcnn_dkd_loss(stu_predictions, tea_predictions, gt_classes, alpha, beta, temperature):
    stu_logits, stu_bbox_offsets = stu_predictions
    tea_logits, tea_bbox_offsets = tea_predictions
    gt_classes = torch.cat(tuple(gt_classes), 0).reshape(-1)
    loss_dkd = dkd_loss(stu_logits, tea_logits, gt_classes, alpha, beta, temperature)
    return {
        'loss_dkd': loss_dkd,
    }

@META_ARCH_REGISTRY.register()
class RCNNKD(nn.Module):
    """
    Generalized R-CNN. Any models that contains the following three components:
    1. Per-image feature extraction (aka backbone)
    2. Region proposal generation
    3. Per-region feature extraction and prediction
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        proposal_generator: nn.Module,
        roi_heads: nn.Module,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
        teacher_pixel_mean: Tuple[float],
        teacher_pixel_std: Tuple[float],
        teacher: nn.Module,
        kd_args,
        input_format: Optional[str] = None,
        teacher_input_format: Optional[str] = None,
        vis_period: int = 0,
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            proposal_generator: a module that generates proposals using backbone features
            roi_heads: a ROI head that performs per-region computation
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
            input_format: describe the meaning of channels of input. Needed by visualization
            vis_period: the period to run visualization. Set to 0 to disable.
        """
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads
        self.teacher = teacher
        self.kd_args = kd_args
        if self.kd_args.TYPE in ("ReviewKD", "ReviewDKD"):
            self.kd_trans = build_kd_trans(self.kd_args)

        self.input_format = input_format
        self.teacher_input_format = teacher_input_format
        self.vis_period = vis_period
        if vis_period > 0:
            assert input_format is not None, "input_format is required for visualization!"

        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        self.register_buffer("teacher_pixel_mean", torch.tensor(teacher_pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("teacher_pixel_std", torch.tensor(teacher_pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"

    @classmethod
    def from_config(cls, cfg):
        backbone = build_backbone(cfg)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
            "kd_args": cfg.KD,
            "teacher": build_teacher(cfg),
            "teacher_input_format": cfg.TEACHER.INPUT.FORMAT,
            "teacher_pixel_mean": cfg.TEACHER.MODEL.PIXEL_MEAN,
            "teacher_pixel_std": cfg.TEACHER.MODEL.PIXEL_STD,
            
        }

    @property
    def device(self):
        return self.pixel_mean.device

    def visualize_training(self, batched_inputs, proposals):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image and up to 20 top-scoring predicted
        object proposals on the original image. Users can implement different
        visualization functions for different models.

        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list that contains predicted proposals. Both
                batched_inputs and proposals should have the same length.
        """
        from detectron2.utils.visualizer import Visualizer

        storage = get_event_storage()
        max_vis_prop = 20

        for input, prop in zip(batched_inputs, proposals):
            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img = np.concatenate((anno_img, prop_img), axis=1)
            vis_img = vis_img.transpose(2, 0, 1)
            vis_name = "Left: GT bounding boxes;  Right: Predicted proposals"
            storage.put_image(vis_name, vis_img)
            break  # only visualize one image in a batch

    def forward_pure_roi_head(self, roi_head, features, proposals):
        features = [features[f] for f in roi_head.box_in_features]
        box_features = roi_head.box_pooler(features, [x.proposal_boxes for x in proposals])
        box_features = roi_head.box_head(box_features)
        predictions = roi_head.box_predictor(box_features)
        return predictions

    def forward(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`
                * proposals (optional): :class:`Instances`, precomputed proposals.

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.

        Returns:
            list[dict]:
                Each dict is the output for one input image.
                The dict contains one key "instances" whose value is a :class:`Instances`.
                The :class:`Instances` object has the following keys:
                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        features = self.backbone(images.tensor)
        losses = {}
        if self.proposal_generator is not None:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}

        sampled_proposals, detector_losses = self.roi_heads(images, features, proposals, gt_instances)

        if self.kd_args.TYPE == "DKD":
            teacher_images = self.teacher_preprocess_image(batched_inputs)
            t_features = self.teacher.backbone(teacher_images.tensor)
            stu_predictions = self.forward_pure_roi_head(self.roi_heads, features, sampled_proposals)
            tea_predictions = self.forward_pure_roi_head(self.teacher.roi_heads, t_features, sampled_proposals)
            detector_losses.update(rcnn_dkd_loss(
                stu_predictions, tea_predictions, [x.gt_classes for x in sampled_proposals], 
                self.kd_args.DKD.ALPHA, self.kd_args.DKD.BETA, self.kd_args.DKD.T))
        elif self.kd_args.TYPE == "ReviewKD":
            teacher_images = self.teacher_preprocess_image(batched_inputs)
            t_features = self.teacher.backbone(teacher_images.tensor)
            t_features = [t_features[f] for f in t_features]
            s_features = [features[f] for f in features]
            s_features = self.kd_trans(s_features)
            losses['loss_reviewkd'] = hcl(s_features, t_features) * self.kd_args.REVIEWKD.LOSS_WEIGHT
        elif self.kd_args.TYPE == "ReviewDKD":
            teacher_images = self.teacher_preprocess_image(batched_inputs)
            t_features = self.teacher.backbone(teacher_images.tensor)
            # dkd loss
            stu_predictions = self.forward_pure_roi_head(self.roi_heads, features, sampled_proposals)
            tea_predictions = self.forward_pure_roi_head(self.teacher.roi_heads, t_features, sampled_proposals)
            detector_losses.update(rcnn_dkd_loss(
                stu_predictions, tea_predictions, [x.gt_classes for x in sampled_proposals], 
                self.kd_args.DKD.ALPHA, self.kd_args.DKD.BETA, self.kd_args.DKD.T))
            # reviewkd loss
            t_features = [t_features[f] for f in t_features]
            s_features = [features[f] for f in features]
            s_features = self.kd_trans(s_features)
            losses['loss_reviewkd'] = hcl(s_features, t_features) * self.kd_args.REVIEWKD.LOSS_WEIGHT
        else:
            raise NotImplementedError(self.kd_args.TYPE)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(
        self,
        batched_inputs: Tuple[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """
        Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`
            detected_instances (None or list[Instances]): if not None, it
                contains an `Instances` object per image. The `Instances`
                object contains "pred_boxes" and "pred_classes" which are
                known boxes in the image.
                The inference will then skip the detection of bounding boxes,
                and only predict other per-ROI outputs.
            do_postprocess (bool): whether to apply post-processing on the outputs.

        Returns:
            When do_postprocess=True, same as in :meth:`forward`.
            Otherwise, a list[Instances] containing raw network outputs.
        """
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        features = self.backbone(images.tensor)

        if detected_instances is None:
            if self.proposal_generator is not None:
                proposals, _ = self.proposal_generator(images, features, None)
            else:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]

            results, _ = self.roi_heads(images, features, proposals, None)
        else:
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return RCNNKD._postprocess(results, batched_inputs, images.image_sizes)
        else:
            return results

    def preprocess_image(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
        return images

    def teacher_preprocess_image(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]):
        """
        Normalize, pad and batch the input images.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.teacher_pixel_mean) / self.teacher_pixel_std for x in images]
        if self.input_format != self.teacher_input_format:
            images = [x.index_select(0,torch.LongTensor([2,1,0]).to(self.device)) for x in images]
        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
        return images

    @staticmethod
    def _postprocess(instances, batched_inputs: Tuple[Dict[str, torch.Tensor]], image_sizes):
        """
        Rescale the output instances to the target size.
        """
        # note: private function; subject to changes
        processed_results = []
        for results_per_image, input_per_image, image_size in zip(
            instances, batched_inputs, image_sizes
        ):
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = detector_postprocess(results_per_image, height, width)
            processed_results.append({"instances": r})
        return processed_results


================================================
FILE: detection/model/reviewkd.py
================================================
import torch
from torch import nn
import torch.nn.functional as F

class ABF(nn.Module):
    def __init__(self, in_channel, mid_channel, out_channel, fuse):
        super(ABF, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channel, mid_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(mid_channel),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(mid_channel, out_channel,kernel_size=3,stride=1,padding=1,bias=False),
            nn.BatchNorm2d(out_channel),
        )
        if fuse:
            self.att_conv = nn.Sequential(
                    nn.Conv2d(mid_channel*2, 2, kernel_size=1),
                    nn.Sigmoid(),
                )
        else:
            self.att_conv = None
        nn.init.kaiming_uniform_(self.conv1[0].weight, a=1)  # pyre-ignore
        nn.init.kaiming_uniform_(self.conv2[0].weight, a=1)  # pyre-ignore

    def forward(self, x, y=None, shape=None):
        n,_,h,w = x.shape
        # transform student features
        x = self.conv1(x)
        if self.att_conv is not None:
            # upsample residual features
            shape = x.shape[-2:]
            y = F.interpolate(y, shape, mode="nearest")
            # fusion
            z = torch.cat([x, y], dim=1)
            z = self.att_conv(z)
            x = (x * z[:,0].view(n,1,h,w) + y * z[:,1].view(n,1,h,w))
        # output 
        y = self.conv2(x)
        return y, x

class ReviewKD(nn.Module):
    def __init__(
        self, in_channels, out_channels, mid_channel
    ):
        super(ReviewKD, self).__init__()

        abfs = nn.ModuleList()

        for idx, in_channel in enumerate(in_channels):
            abfs.append(ABF(in_channel, mid_channel, out_channels[idx], idx < len(in_channels)-1))


        self.abfs = abfs[::-1]

    def forward(self, student_features):
        x = student_features[::-1]
        results = []
        out_features, res_features = self.abfs[0](x[0])
        results.append(out_features)
        for features, abf in zip(x[1:], self.abfs[1:]):
            out_features, res_features = abf(features, res_features)
            results.insert(0, out_features)

        return results


def build_kd_trans(cfg):
    in_channels = [256,256,256,256,256]
    out_channels = [256,256,256,256,256]
    mid_channel = 256
    model = ReviewKD(in_channels, out_channels, mid_channel)
    return model

def hcl(fstudent, fteacher):
    loss_all = 0.0
    for fs, ft in zip(fstudent, fteacher):
        n,c,h,w = fs.shape
        loss = F.mse_loss(fs, ft, reduction='mean')
        cnt = 1.0
        tot = 1.0
        for l in [4,2,1]:
            if l >=h:
                continue
            tmpfs = F.adaptive_avg_pool2d(fs, (l,l))
            tmpft = F.adaptive_avg_pool2d(ft, (l,l))
            cnt /= 2.0
            loss += F.mse_loss(tmpfs, tmpft, reduction='mean') * cnt
            tot += cnt
        loss = loss / tot
        loss_all = loss_all + loss
    return loss_all


================================================
FILE: detection/model/teacher/__init__.py
================================================
from .teacher import build_teacher


================================================
FILE: detection/model/teacher/teacher.py
================================================
from detectron2.modeling.backbone import build_backbone
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.modeling.roi_heads import build_roi_heads
from detectron2.checkpoint import DetectionCheckpointer

from torch import nn

class Teacher(nn.Module):
    def __init__(self, backbone, proposal_generator, roi_heads):
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads

def build_teacher(cfg):
    teacher_cfg = cfg.TEACHER
    backbone = build_backbone(teacher_cfg)
    if not 'Retina' in teacher_cfg.MODEL.META_ARCHITECTURE:
        proposal_generator = build_proposal_generator(teacher_cfg, backbone.output_shape())
        roi_heads = build_roi_heads(teacher_cfg, backbone.output_shape())
    else:
        proposal_generator = None
        roi_heads = None
    teacher = Teacher(backbone, proposal_generator, roi_heads)
    for param in teacher.parameters():
        param.requires_grad = False
    return teacher




================================================
FILE: detection/train_net.py
================================================
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
"""
A main training script.

This scripts reads a given config file and runs the training or evaluation.
It is an entry point that is made to train standard models in detectron2.

In order to let one script support training of many models,
this script contains logic that are specific to these built-in models and therefore
may not be suitable for your own project.
For example, your research project perhaps only needs a single "evaluator".

Therefore, we recommend you to use detectron2 as an library and take
this file as an example of how to use the library.
You may want to write your own script with your datasets and other customizations.
"""

import logging
import os
from collections import OrderedDict
import torch

import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import MetadataCatalog
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch
from detectron2.evaluation import (
    CityscapesInstanceEvaluator,
    CityscapesSemSegEvaluator,
    COCOEvaluator,
    COCOPanopticEvaluator,
    DatasetEvaluators,
    LVISEvaluator,
    PascalVOCDetectionEvaluator,
    SemSegEvaluator,
    verify_results,
)
from detectron2.modeling import GeneralizedRCNNWithTTA

from model import add_distillation_cfg
from model import RCNNKD


class Trainer(DefaultTrainer):
    """
    We use the "DefaultTrainer" which contains pre-defined default logic for
    standard training workflow. They may not work for you, especially if you
    are working on a new research project. In that case you can write your
    own training loop. You can use "tools/plain_train_net.py" as an example.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create evaluator(s) for a given dataset.
        This uses the special metadata "evaluator_type" associated with each builtin dataset.
        For your own dataset, you can simply create an evaluator manually in your
        script and do not have to worry about the hacky if-else logic here.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_list = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type in ["sem_seg", "coco_panoptic_seg"]:
            evaluator_list.append(
                SemSegEvaluator(
                    dataset_name,
                    distributed=True,
                    output_dir=output_folder,
                )
            )
        if evaluator_type in ["coco", "coco_panoptic_seg"]:
            evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
        if evaluator_type == "coco_panoptic_seg":
            evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
        if evaluator_type == "cityscapes_instance":
            assert (
                torch.cuda.device_count() >= comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesInstanceEvaluator(dataset_name)
        if evaluator_type == "cityscapes_sem_seg":
            assert (
                torch.cuda.device_count() >= comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesSemSegEvaluator(dataset_name)
        elif evaluator_type == "pascal_voc":
            return PascalVOCDetectionEvaluator(dataset_name)
        elif evaluator_type == "lvis":
            return LVISEvaluator(dataset_name, output_dir=output_folder)
        if len(evaluator_list) == 0:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        elif len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

    @classmethod
    def test_with_TTA(cls, cfg, model):
        logger = logging.getLogger("detectron2.trainer")
        # In the end of training, run an evaluation with TTA
        # Only support some R-CNN models.
        logger.info("Running inference with test-time augmentation ...")
        model = GeneralizedRCNNWithTTA(cfg, model)
        evaluators = [
            cls.build_evaluator(
                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
            )
            for name in cfg.DATASETS.TEST
        ]
        res = cls.test(cfg, model, evaluators)
        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
        return res


def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    add_distillation_cfg(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg


def main(args):
    cfg = setup(args)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop (see plain_train_net.py) or
    subclassing the trainer.
    """
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    if cfg.TEST.AUG.ENABLED:
        trainer.register_hooks(
            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
        )
    return trainer.train()


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )


================================================
FILE: mdistiller/__init__.py
================================================


================================================
FILE: mdistiller/dataset/__init__.py
================================================
from .cifar100 import get_cifar100_dataloaders, get_cifar100_dataloaders_sample
from .imagenet import get_imagenet_dataloaders, get_imagenet_dataloaders_sample
from .tiny_imagenet import get_tinyimagenet_dataloader, get_tinyimagenet_dataloader_sample


def get_dataset(cfg):
    if cfg.DATASET.TYPE == "cifar100":
        if cfg.DISTILLER.TYPE == "CRD":
            train_loader, val_loader, num_data = get_cifar100_dataloaders_sample(
                batch_size=cfg.SOLVER.BATCH_SIZE,
                val_batch_size=cfg.DATASET.TEST.BATCH_SIZE,
                num_workers=cfg.DATASET.NUM_WORKERS,
                k=cfg.CRD.NCE.K,
                mode=cfg.CRD.MODE,
            )
        else:
            train_loader, val_loader, num_data = get_cifar100_dataloaders(
                batch_size=cfg.SOLVER.BATCH_SIZE,
                val_batch_size=cfg.DATASET.TEST.BATCH_SIZE,
                num_workers=cfg.DATASET.NUM_WORKERS,
            )
        num_classes = 100
    elif cfg.DATASET.TYPE == "imagenet":
        if cfg.DISTILLER.TYPE == "CRD":
            train_loader, val_loader, num_data = get_imagenet_dataloaders_sample(
                batch_size=cfg.SOLVER.BATCH_SIZE,
                val_batch_size=cfg.DATASET.TEST.BATCH_SIZE,
                num_workers=cfg.DATASET.NUM_WORKERS,
                k=cfg.CRD.NCE.K,
            )
        else:
            train_loader, val_loader, num_data = get_imagenet_dataloaders(
                batch_size=cfg.SOLVER.BATCH_SIZE,
                val_batch_size=cfg.DATASET.TEST.BATCH_SIZE,
                num_workers=cfg.DATASET.NUM_WORKERS,
            )
        num_classes = 1000
    elif cfg.DATASET.TYPE == "tiny_imagenet":
        if cfg.DISTILLER.TYPE in ("CRD", "CRDKD"):
            train_loader, val_loader, num_data = get_tinyimagenet_dataloader_sample(
                batch_size=cfg.SOLVER.BATCH_SIZE,
                val_batch_size=cfg.DATASET.TEST.BATCH_SIZE,
                num_workers=cfg.DATASET.NUM_WORKERS,
                k=cfg.CRD.NCE.K,
            )
        else:
            train_loader, val_loader, num_data = get_tinyimagenet_dataloader(
                batch_size=cfg.SOLVER.BATCH_SIZE,
                val_batch_size=cfg.DATASET.TEST.BATCH_SIZE,
                num_workers=cfg.DATASET.NUM_WORKERS,
            )
        num_classes = 200
    else:
        raise NotImplementedError(cfg.DATASET.TYPE)

    return train_loader, val_loader, num_data, num_classes


================================================
FILE: mdistiller/dataset/cifar100.py
================================================
import os
import numpy as np
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from PIL import Image


def get_data_folder():
    data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../data")
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    return data_folder


class CIFAR100Instance(datasets.CIFAR100):
    """CIFAR100Instance Dataset."""

    def __getitem__(self, index):
        img, target = super().__getitem__(index)
        return img, target, index


# CIFAR-100 for CRD
class CIFAR100InstanceSample(datasets.CIFAR100):
    """
    CIFAR100Instance+Sample Dataset
    """

    def __init__(
        self,
        root,
        train=True,
        transform=None,
        target_transform=None,
        download=False,
        k=4096,
        mode="exact",
        is_sample=True,
        percent=1.0,
    ):
        super().__init__(
            root=root,
            train=train,
            download=download,
            transform=transform,
            target_transform=target_transform,
        )
        self.k = k
        self.mode = mode
        self.is_sample = is_sample

        num_classes = 100
        num_samples = len(self.data)
        label = self.targets

        self.cls_positive = [[] for i in range(num_classes)]
        for i in range(num_samples):
            self.cls_positive[label[i]].append(i)

        self.cls_negative = [[] for i in range(num_classes)]
        for i in range(num_classes):
            for j in range(num_classes):
                if j == i:
                    continue
                self.cls_negative[i].extend(self.cls_positive[j])

        self.cls_positive = [
            np.asarray(self.cls_positive[i]) for i in range(num_classes)
        ]
        self.cls_negative = [
            np.asarray(self.cls_negative[i]) for i in range(num_classes)
        ]

        if 0 < percent < 1:
            n = int(len(self.cls_negative[0]) * percent)
            self.cls_negative = [
                np.random.permutation(self.cls_negative[i])[0:n]
                for i in range(num_classes)
            ]

        self.cls_positive = np.asarray(self.cls_positive)
        self.cls_negative = np.asarray(self.cls_negative)

    def __getitem__(self, index):
        img, target = self.data[index], self.targets[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        if not self.is_sample:
            # directly return
            return img, target, index
        else:
            # sample contrastive examples
            if self.mode == "exact":
                pos_idx = index
            elif self.mode == "relax":
                pos_idx = np.random.choice(self.cls_positive[target], 1)
                pos_idx = pos_idx[0]
            else:
                raise NotImplementedError(self.mode)
            replace = True if self.k > len(self.cls_negative[target]) else False
            neg_idx = np.random.choice(
                self.cls_negative[target], self.k, replace=replace
            )
            sample_idx = np.hstack((np.asarray([pos_idx]), neg_idx))
            return img, target, index, sample_idx


def get_cifar100_train_transform():
    train_transform = transforms.Compose(
        [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
        ]
    )

    return train_transform


def get_cifar100_test_transform():
    return transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
        ]
    )


def get_cifar100_dataloaders(batch_size, val_batch_size, num_workers):
    data_folder = get_data_folder()
    train_transform = get_cifar100_train_transform()
    test_transform = get_cifar100_test_transform()
    train_set = CIFAR100Instance(
        root=data_folder, download=True, train=True, transform=train_transform
    )
    num_data = len(train_set)
    test_set = datasets.CIFAR100(
        root=data_folder, download=True, train=False, transform=test_transform
    )

    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    test_loader = DataLoader(
        test_set,
        batch_size=val_batch_size,
        shuffle=False,
        num_workers=1,
    )
    return train_loader, test_loader, num_data


# CIFAR-100 for CRD
def get_cifar100_dataloaders_sample(
    batch_size, val_batch_size, num_workers, k, mode="exact"
):
    data_folder = get_data_folder()
    train_transform = get_cifar100_train_transform()
    test_transform = get_cifar100_test_transform()

    train_set = CIFAR100InstanceSample(
        root=data_folder,
        download=True,
        train=True,
        transform=train_transform,
        k=k,
        mode=mode,
        is_sample=True,
        percent=1.0,
    )
    num_data = len(train_set)
    test_set = datasets.CIFAR100(
        root=data_folder, download=True, train=False, transform=test_transform
    )

    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    test_loader = DataLoader(
        test_set,
        batch_size=val_batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    return train_loader, test_loader, num_data


================================================
FILE: mdistiller/dataset/imagenet.py
================================================
import os
import numpy as np
import torch
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms


data_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../data/imagenet')


class ImageNet(ImageFolder):
    def __getitem__(self, index):
        img, target = super().__getitem__(index)
        return img, target, index


class ImageNetInstanceSample(ImageNet):
    """: Folder datasets which returns (img, label, index, contrast_index):
    """
    def __init__(self, folder, transform=None, target_transform=None,
                 is_sample=False, k=4096):
        super().__init__(folder, transform=transform)

        self.k = k
        self.is_sample = is_sample
        if self.is_sample:
            print('preparing contrastive data...')
            num_classes = 1000
            num_samples = len(self.samples)
            label = np.zeros(num_samples, dtype=np.int32)
            for i in range(num_samples):
                _, target = self.samples[i]
                label[i] = target

            self.cls_positive = [[] for i in range(num_classes)]
            for i in range(num_samples):
                self.cls_positive[label[i]].append(i)

            self.cls_negative = [[] for i in range(num_classes)]
            for i in range(num_classes):
                for j in range(num_classes):
                    if j == i:
                        continue
                    self.cls_negative[i].extend(self.cls_positive[j])

            self.cls_positive = [np.asarray(self.cls_positive[i], dtype=np.int32) for i in range(num_classes)]
            self.cls_negative = [np.asarray(self.cls_negative[i], dtype=np.int32) for i in range(num_classes)]
            print('done.')

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        img, target, index = super().__getitem__(index)

        if self.is_sample:
            # sample contrastive examples
            pos_idx = index
            neg_idx = np.random.choice(self.cls_negative[target], self.k, replace=True)
            sample_idx = np.hstack((np.asarray([pos_idx]), neg_idx))
            return img, target, index, sample_idx
        else:
            return img, target, index

def get_imagenet_train_transform(mean, std):
    normalize = transforms.Normalize(mean=mean, std=std)
    train_transform = transforms.Compose(
        [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]
    )
    return train_transform

def get_imagenet_test_transform(mean, std):
    normalize = transforms.Normalize(mean=mean, std=std)
    test_transform = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ]
    )
    return test_transform

def get_imagenet_dataloaders(batch_size, val_batch_size, num_workers,
    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    train_transform = get_imagenet_train_transform(mean, std)
    train_folder = os.path.join(data_folder, 'train')
    train_set = ImageNet(train_folder, transform=train_transform)
    num_data = len(train_set)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, 
        shuffle=True, num_workers=num_workers, pin_memory=True)
    test_loader = get_imagenet_val_loader(val_batch_size, mean, std)
    return train_loader, test_loader, num_data

def get_imagenet_dataloaders_sample(batch_size, val_batch_size, num_workers, k=4096, 
    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    train_transform = get_imagenet_train_transform(mean, std)
    train_folder = os.path.join(data_folder, 'train')
    train_set = ImageNetInstanceSample(train_folder, transform=train_transform, is_sample=True, k=k)
    num_data = len(train_set)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, 
        shuffle=True, num_workers=num_workers, pin_memory=True)
    test_loader = get_imagenet_val_loader(val_batch_size, mean, std)
    return train_loader, test_loader, num_data

def get_imagenet_val_loader(val_batch_size, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    test_transform = get_imagenet_test_transform(mean, std)
    test_folder = os.path.join(data_folder, 'val')
    test_set = ImageFolder(test_folder, transform=test_transform)
    test_loader = torch.utils.data.DataLoader(test_set,
        batch_size=val_batch_size, shuffle=False, num_workers=16, pin_memory=True)
    return test_loader


================================================
FILE: mdistiller/dataset/tiny_imagenet.py
================================================
import os
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
import numpy as np


data_folder = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "../../data/tiny-imagenet-200"
)


class ImageFolderInstance(datasets.ImageFolder):
    def __getitem__(self, index):
        path, target = self.imgs[index]
        img = self.loader(path)
        if self.transform is not None:
            img = self.transform(img)
        return img, target, index


class ImageFolderInstanceSample(ImageFolderInstance):
    """: Folder datasets which returns (img, label, index, contrast_index):
    """
    def __init__(self, folder, transform=None, target_transform=None,
                 is_sample=False, k=4096):
        super().__init__(folder, transform=transform)

        self.k = k
        self.is_sample = is_sample
        if self.is_sample:
            num_classes = 200
            num_samples = len(self.samples)
            label = np.zeros(num_samples, dtype=np.int32)
            for i in range(num_samples):
                img, target = self.samples[i]
                label[i] = target

            self.cls_positive = [[] for i in range(num_classes)]
            for i in range(num_samples):
                self.cls_positive[label[i]].append(i)

            self.cls_negative = [[] for i in range(num_classes)]
            for i in range(num_classes):
                for j in range(num_classes):
                    if j == i:
                        continue
                    self.cls_negative[i].extend(self.cls_positive[j])

            self.cls_positive = [np.asarray(self.cls_positive[i], dtype=np.int32) for i in range(num_classes)]
            self.cls_negative = [np.asarray(self.cls_negative[i], dtype=np.int32) for i in range(num_classes)]
        print('dataset initialized!')

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target) where target is class_index of the target class.
        """
        img, target, index = super().__getitem__(index)

        if self.is_sample:
            # sample contrastive examples
            pos_idx = index
            neg_idx = np.random.choice(self.cls_negative[target], self.k, replace=True)
            sample_idx = np.hstack((np.asarray([pos_idx]), neg_idx))
            return img, target, index, sample_idx
        else:
            return img, target, index


def get_tinyimagenet_dataloader(batch_size, val_batch_size, num_workers):
    """Data Loader for tiny-imagenet"""
    train_transform = transforms.Compose([
                transforms.RandomRotation(20),
                transforms.RandomHorizontalFlip(0.5),
                transforms.ToTensor(),
                transforms.Normalize([0.4802, 0.4481, 0.3975], [0.2302, 0.2265, 0.2262]),
            ])
    test_transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize([0.4802, 0.4481, 0.3975], [0.2302, 0.2265, 0.2262]),
            ])
    train_folder = os.path.join(data_folder, "train")
    test_folder = os.path.join(data_folder, "val")
    train_set = ImageFolderInstance(train_folder, transform=train_transform)
    num_data = len(train_set)
    test_set = datasets.ImageFolder(test_folder, transform=test_transform)
    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    test_loader = DataLoader(
        test_set, batch_size=val_batch_size, shuffle=False, num_workers=1
    )
    return train_loader, test_loader, num_data


def get_tinyimagenet_dataloader_sample(batch_size, val_batch_size, num_workers, k):
    """Data Loader for tiny-imagenet"""
    train_transform = transforms.Compose([
                transforms.RandomRotation(20),
                transforms.RandomHorizontalFlip(0.5),
                transforms.ToTensor(),
                transforms.Normalize([0.4802, 0.4481, 0.3975], [0.2302, 0.2265, 0.2262]),
            ])
    test_transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize([0.4802, 0.4481, 0.3975], [0.2302, 0.2265, 0.2262]),
            ])
    train_folder = os.path.join(data_folder, "train")
    test_folder = os.path.join(data_folder, "val")
    train_set = ImageFolderInstanceSample(train_folder, transform=train_transform, is_sample=True, k=k)
    num_data = len(train_set)
    test_set = datasets.ImageFolder(test_folder, transform=test_transform)
    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )
    test_loader = DataLoader(
        test_set, batch_size=val_batch_size, shuffle=False, num_workers=1
    )
    return train_loader, test_loader, num_data


================================================
FILE: mdistiller/distillers/AT.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def single_stage_at_loss(f_s, f_t, p):
    def _at(feat, p):
        return F.normalize(feat.pow(p).mean(1).reshape(feat.size(0), -1))

    s_H, t_H = f_s.shape[2], f_t.shape[2]
    if s_H > t_H:
        f_s = F.adaptive_avg_pool2d(f_s, (t_H, t_H))
    elif s_H < t_H:
        f_t = F.adaptive_avg_pool2d(f_t, (s_H, s_H))
    return (_at(f_s, p) - _at(f_t, p)).pow(2).mean()


def at_loss(g_s, g_t, p):
    return sum([single_stage_at_loss(f_s, f_t, p) for f_s, f_t in zip(g_s, g_t)])


class AT(Distiller):
    """
    Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer
    src code: https://github.com/szagoruyko/attention-transfer
    """

    def __init__(self, student, teacher, cfg):
        super(AT, self).__init__(student, teacher)
        self.p = cfg.AT.P
        self.ce_loss_weight = cfg.AT.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.AT.LOSS.FEAT_WEIGHT

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_feat = self.feat_loss_weight * at_loss(
            feature_student["feats"][1:], feature_teacher["feats"][1:], self.p
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_feat,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/CRD.py
================================================
import torch
from torch import nn
import torch.nn.functional as F
import math

from ._base import Distiller


class CRD(Distiller):
    """Contrastive Representation Distillation"""

    def __init__(self, student, teacher, cfg, num_data):
        super(CRD, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.CRD.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.CRD.LOSS.FEAT_WEIGHT
        self.init_crd_modules(
            cfg.CRD.FEAT.STUDENT_DIM,
            cfg.CRD.FEAT.TEACHER_DIM,
            cfg.CRD.FEAT.DIM,
            num_data,
            cfg.CRD.NCE.K,
            cfg.CRD.NCE.MOMENTUM,
            cfg.CRD.NCE.TEMPERATURE,
        )

    def init_crd_modules(
        self,
        feat_s_channel,
        feat_t_channel,
        feat_dim,
        num_data,
        k=16384,
        momentum=0.5,
        temperature=0.07,
    ):
        self.embed_s = Embed(feat_s_channel, feat_dim)
        self.embed_t = Embed(feat_t_channel, feat_dim)
        self.contrast = ContrastMemory(feat_dim, num_data, k, temperature, momentum)
        self.criterion_s = ContrastLoss(num_data)
        self.criterion_t = ContrastLoss(num_data)

    def get_learnable_parameters(self):
        return (
            super().get_learnable_parameters()
            + list(self.embed_s.parameters())
            + list(self.embed_t.parameters())
        )

    def get_extra_parameters(self):
        params = (
            list(self.embed_s.parameters())
            + list(self.embed_t.parameters())
            + list(self.contrast.buffers())
        )
        num_p = 0
        for p in params:
            num_p += p.numel()
        return num_p

    def crd_loss(self, f_s, f_t, idx, contrast_idx):
        f_s = self.embed_s(f_s)
        f_t = self.embed_t(f_t)
        out_s, out_t = self.contrast(f_s, f_t, idx, contrast_idx)
        s_loss = self.criterion_s(out_s)
        t_loss = self.criterion_t(out_t)
        return s_loss + t_loss

    def forward_train(self, image, target, index, contrastive_index, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_crd = self.feat_loss_weight * self.crd_loss(
            feature_student["pooled_feat"],
            feature_teacher["pooled_feat"],
            index,
            contrastive_index,
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_crd,
        }
        return logits_student, losses_dict


class Normalize(nn.Module):
    """normalization layer"""

    def __init__(self, power=2):
        super(Normalize, self).__init__()
        self.power = power

    def forward(self, x):
        norm = x.pow(self.power).sum(1, keepdim=True).pow(1.0 / self.power)
        out = x.div(norm)
        return out


class Embed(nn.Module):
    """Embedding module"""

    def __init__(self, dim_in=1024, dim_out=128):
        super(Embed, self).__init__()
        self.linear = nn.Linear(dim_in, dim_out)
        self.l2norm = Normalize(2)

    def forward(self, x):
        x = x.reshape(x.shape[0], -1)
        x = self.linear(x)
        x = self.l2norm(x)
        return x


class ContrastLoss(nn.Module):
    """contrastive loss"""

    def __init__(self, num_data):
        super(ContrastLoss, self).__init__()
        self.num_data = num_data

    def forward(self, x):
        eps = 1e-7
        bsz = x.shape[0]
        m = x.size(1) - 1

        # noise distribution
        Pn = 1 / float(self.num_data)

        # loss for positive pair
        P_pos = x.select(1, 0)
        log_D1 = torch.div(P_pos, P_pos.add(m * Pn + eps)).log_()

        # loss for K negative pair
        P_neg = x.narrow(1, 1, m)
        log_D0 = torch.div(P_neg.clone().fill_(m * Pn), P_neg.add(m * Pn + eps)).log_()

        loss = -(log_D1.sum(0) + log_D0.view(-1, 1).sum(0)) / bsz

        return loss


class ContrastMemory(nn.Module):
    """memory buffer that supplies large amount of negative samples."""

    def __init__(self, inputSize, output_size, K, T=0.07, momentum=0.5):
        super(ContrastMemory, self).__init__()
        self.n_lem = output_size
        self.unigrams = torch.ones(self.n_lem)
        self.multinomial = AliasMethod(self.unigrams)
        self.multinomial.cuda()
        self.K = K

        self.register_buffer("params", torch.tensor([K, T, -1, -1, momentum]))
        stdv = 1.0 / math.sqrt(inputSize / 3)
        self.register_buffer(
            "memory_v1", torch.rand(output_size, inputSize).mul_(2 * stdv).add_(-stdv)
        )
        self.register_buffer(
            "memory_v2", torch.rand(output_size, inputSize).mul_(2 * stdv).add_(-stdv)
        )

    def forward(self, v1, v2, y, idx=None):
        K = int(self.params[0].item())
        T = self.params[1].item()
        Z_v1 = self.params[2].item()
        Z_v2 = self.params[3].item()

        momentum = self.params[4].item()
        batchSize = v1.size(0)
        outputSize = self.memory_v1.size(0)
        inputSize = self.memory_v1.size(1)

        # original score computation
        if idx is None:
            idx = self.multinomial.draw(batchSize * (self.K + 1)).view(batchSize, -1)
            idx.select(1, 0).copy_(y.data)
        # sample
        weight_v1 = torch.index_select(self.memory_v1, 0, idx.view(-1)).detach()
        weight_v1 = weight_v1.view(batchSize, K + 1, inputSize)
        out_v2 = torch.bmm(weight_v1, v2.view(batchSize, inputSize, 1))
        out_v2 = torch.exp(torch.div(out_v2, T))
        # sample
        weight_v2 = torch.index_select(self.memory_v2, 0, idx.view(-1)).detach()
        weight_v2 = weight_v2.view(batchSize, K + 1, inputSize)
        out_v1 = torch.bmm(weight_v2, v1.view(batchSize, inputSize, 1))
        out_v1 = torch.exp(torch.div(out_v1, T))

        # set Z if haven't been set yet
        if Z_v1 < 0:
            self.params[2] = out_v1.mean() * outputSize
            Z_v1 = self.params[2].clone().detach().item()
            # print("normalization constant Z_v1 is set to {:.1f}".format(Z_v1))
        if Z_v2 < 0:
            self.params[3] = out_v2.mean() * outputSize
            Z_v2 = self.params[3].clone().detach().item()
            # print("normalization constant Z_v2 is set to {:.1f}".format(Z_v2))

        # compute out_v1, out_v2
        out_v1 = torch.div(out_v1, Z_v1).contiguous()
        out_v2 = torch.div(out_v2, Z_v2).contiguous()

        # update memory
        with torch.no_grad():
            l_pos = torch.index_select(self.memory_v1, 0, y.view(-1))
            l_pos.mul_(momentum)
            l_pos.add_(torch.mul(v1, 1 - momentum))
            l_norm = l_pos.pow(2).sum(1, keepdim=True).pow(0.5)
            updated_v1 = l_pos.div(l_norm)
            self.memory_v1.index_copy_(0, y, updated_v1)

            ab_pos = torch.index_select(self.memory_v2, 0, y.view(-1))
            ab_pos.mul_(momentum)
            ab_pos.add_(torch.mul(v2, 1 - momentum))
            ab_norm = ab_pos.pow(2).sum(1, keepdim=True).pow(0.5)
            updated_v2 = ab_pos.div(ab_norm)
            self.memory_v2.index_copy_(0, y, updated_v2)

        return out_v1, out_v2


class AliasMethod(object):
    """
    From: https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
    """

    def __init__(self, probs):

        if probs.sum() > 1:
            probs.div_(probs.sum())
        K = len(probs)
        self.prob = torch.zeros(K)
        self.alias = torch.LongTensor([0] * K)

        # Sort the data into the outcomes with probabilities
        # that are larger and smaller than 1/K.
        smaller = []
        larger = []
        for kk, prob in enumerate(probs):
            self.prob[kk] = K * prob
            if self.prob[kk] < 1.0:
                smaller.append(kk)
            else:
                larger.append(kk)

        # Loop though and create little binary mixtures that
        # appropriately allocate the larger outcomes over the
        # overall uniform mixture.
        while len(smaller) > 0 and len(larger) > 0:
            small = smaller.pop()
            large = larger.pop()

            self.alias[small] = large
            self.prob[large] = (self.prob[large] - 1.0) + self.prob[small]

            if self.prob[large] < 1.0:
                smaller.append(large)
            else:
                larger.append(large)

        for last_one in smaller + larger:
            self.prob[last_one] = 1

    def cuda(self):
        self.prob = self.prob.cuda()
        self.alias = self.alias.cuda()

    def draw(self, N):
        """Draw N samples from multinomial"""
        K = self.alias.size(0)

        kk = torch.zeros(N, dtype=torch.long, device=self.prob.device).random_(0, K)
        prob = self.prob.index_select(0, kk)
        alias = self.alias.index_select(0, kk)
        # b is whether a random number is greater than q
        b = torch.bernoulli(prob)
        oq = kk.mul(b.long())
        oj = alias.mul((1 - b).long())

        return oq + oj


================================================
FILE: mdistiller/distillers/DKD.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def dkd_loss(logits_student, logits_teacher, target, alpha, beta, temperature):
    gt_mask = _get_gt_mask(logits_student, target)
    other_mask = _get_other_mask(logits_student, target)
    pred_student = F.softmax(logits_student / temperature, dim=1)
    pred_teacher = F.softmax(logits_teacher / temperature, dim=1)
    pred_student = cat_mask(pred_student, gt_mask, other_mask)
    pred_teacher = cat_mask(pred_teacher, gt_mask, other_mask)
    log_pred_student = torch.log(pred_student)
    tckd_loss = (
        F.kl_div(log_pred_student, pred_teacher, size_average=False)
        * (temperature**2)
        / target.shape[0]
    )
    pred_teacher_part2 = F.softmax(
        logits_teacher / temperature - 1000.0 * gt_mask, dim=1
    )
    log_pred_student_part2 = F.log_softmax(
        logits_student / temperature - 1000.0 * gt_mask, dim=1
    )
    nckd_loss = (
        F.kl_div(log_pred_student_part2, pred_teacher_part2, size_average=False)
        * (temperature**2)
        / target.shape[0]
    )
    return alpha * tckd_loss + beta * nckd_loss


def _get_gt_mask(logits, target):
    target = target.reshape(-1)
    mask = torch.zeros_like(logits).scatter_(1, target.unsqueeze(1), 1).bool()
    return mask


def _get_other_mask(logits, target):
    target = target.reshape(-1)
    mask = torch.ones_like(logits).scatter_(1, target.unsqueeze(1), 0).bool()
    return mask


def cat_mask(t, mask1, mask2):
    t1 = (t * mask1).sum(dim=1, keepdims=True)
    t2 = (t * mask2).sum(1, keepdims=True)
    rt = torch.cat([t1, t2], dim=1)
    return rt


class DKD(Distiller):
    """Decoupled Knowledge Distillation(CVPR 2022)"""

    def __init__(self, student, teacher, cfg):
        super(DKD, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.DKD.CE_WEIGHT
        self.alpha = cfg.DKD.ALPHA
        self.beta = cfg.DKD.BETA
        self.temperature = cfg.DKD.T
        self.warmup = cfg.DKD.WARMUP

    def forward_train(self, image, target, **kwargs):
        logits_student, _ = self.student(image)
        with torch.no_grad():
            logits_teacher, _ = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_dkd = min(kwargs["epoch"] / self.warmup, 1.0) * dkd_loss(
            logits_student,
            logits_teacher,
            target,
            self.alpha,
            self.beta,
            self.temperature,
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_dkd,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/FitNet.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller
from ._common import ConvReg, get_feat_shapes


class FitNet(Distiller):
    """FitNets: Hints for Thin Deep Nets"""

    def __init__(self, student, teacher, cfg):
        super(FitNet, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.FITNET.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.FITNET.LOSS.FEAT_WEIGHT
        self.hint_layer = cfg.FITNET.HINT_LAYER
        feat_s_shapes, feat_t_shapes = get_feat_shapes(
            self.student, self.teacher, cfg.FITNET.INPUT_SIZE
        )
        self.conv_reg = ConvReg(
            feat_s_shapes[self.hint_layer], feat_t_shapes[self.hint_layer]
        )

    def get_learnable_parameters(self):
        return super().get_learnable_parameters() + list(self.conv_reg.parameters())

    def get_extra_parameters(self):
        num_p = 0
        for p in self.conv_reg.parameters():
            num_p += p.numel()
        return num_p

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        f_s = self.conv_reg(feature_student["feats"][self.hint_layer])
        loss_feat = self.feat_loss_weight * F.mse_loss(
            f_s, feature_teacher["feats"][self.hint_layer]
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_feat,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/KD.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def kd_loss(logits_student, logits_teacher, temperature):
    log_pred_student = F.log_softmax(logits_student / temperature, dim=1)
    pred_teacher = F.softmax(logits_teacher / temperature, dim=1)
    loss_kd = F.kl_div(log_pred_student, pred_teacher, reduction="none").sum(1).mean()
    loss_kd *= temperature**2
    return loss_kd


class KD(Distiller):
    """Distilling the Knowledge in a Neural Network"""

    def __init__(self, student, teacher, cfg):
        super(KD, self).__init__(student, teacher)
        self.temperature = cfg.KD.TEMPERATURE
        self.ce_loss_weight = cfg.KD.LOSS.CE_WEIGHT
        self.kd_loss_weight = cfg.KD.LOSS.KD_WEIGHT

    def forward_train(self, image, target, **kwargs):
        logits_student, _ = self.student(image)
        with torch.no_grad():
            logits_teacher, _ = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_kd = self.kd_loss_weight * kd_loss(
            logits_student, logits_teacher, self.temperature
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_kd,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/KDSVD.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def kdsvd_loss(g_s, g_t, k):
    v_sb = None
    v_tb = None
    losses = []
    for i, f_s, f_t in zip(range(len(g_s)), g_s, g_t):
        u_t, s_t, v_t = svd(f_t, k)
        u_s, s_s, v_s = svd(f_s, k + 3)
        v_s, v_t = align_rsv(v_s, v_t)
        s_t = s_t.unsqueeze(1)
        v_t = v_t * s_t
        v_s = v_s * s_t

        if i > 0:
            s_rbf = torch.exp(-(v_s.unsqueeze(2) - v_sb.unsqueeze(1)).pow(2) / 8)
            t_rbf = torch.exp(-(v_t.unsqueeze(2) - v_tb.unsqueeze(1)).pow(2) / 8)

            l2loss = (s_rbf - t_rbf.detach()).pow(2)
            l2loss = torch.where(
                torch.isfinite(l2loss), l2loss, torch.zeros_like(l2loss)
            )
            losses.append(l2loss.sum())

        v_tb = v_t
        v_sb = v_s

    bsz = g_s[0].shape[0]
    losses = [l / bsz for l in losses]
    return sum(losses)


def svd(feat, n=1):
    size = feat.shape
    assert len(size) == 4

    x = feat.view(size[0], size[1] * size[2], size[3]).float()
    u, s, v = torch.svd(x)

    u = removenan(u)
    s = removenan(s)
    v = removenan(v)

    if n > 0:
        u = F.normalize(u[:, :, :n], dim=1)
        s = F.normalize(s[:, :n], dim=1)
        v = F.normalize(v[:, :, :n], dim=1)

    return u, s, v


def removenan(x):
    x = torch.where(torch.isfinite(x), x, torch.zeros_like(x))
    return x


def align_rsv(a, b):
    cosine = torch.matmul(a.transpose(-2, -1), b)
    max_abs_cosine, _ = torch.max(torch.abs(cosine), 1, keepdim=True)
    mask = torch.where(
        torch.eq(max_abs_cosine, torch.abs(cosine)),
        torch.sign(cosine),
        torch.zeros_like(cosine),
    )
    a = torch.matmul(a, mask)
    return a, b


class KDSVD(Distiller):
    """
    Self-supervised Knowledge Distillation using Singular Value Decomposition
    original Tensorflow code: https://github.com/sseung0703/SSKD_SVD
    """

    def __init__(self, student, teacher, cfg):
        super(KDSVD, self).__init__(student, teacher)
        self.k = cfg.KDSVD.K
        self.ce_loss_weight = cfg.KDSVD.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.KDSVD.LOSS.FEAT_WEIGHT

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_feat = self.feat_loss_weight * kdsvd_loss(
            feature_student["feats"][1:], feature_teacher["feats"][1:], self.k
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_feat,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/NST.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def nst_loss(g_s, g_t):
    return sum([single_stage_nst_loss(f_s, f_t) for f_s, f_t in zip(g_s, g_t)])


def single_stage_nst_loss(f_s, f_t):
    s_H, t_H = f_s.shape[2], f_t.shape[2]
    if s_H > t_H:
        f_s = F.adaptive_avg_pool2d(f_s, (t_H, t_H))
    elif s_H < t_H:
        f_t = F.adaptive_avg_pool2d(f_t, (s_H, s_H))

    f_s = f_s.view(f_s.shape[0], f_s.shape[1], -1)
    f_s = F.normalize(f_s, dim=2)
    f_t = f_t.view(f_t.shape[0], f_t.shape[1], -1)
    f_t = F.normalize(f_t, dim=2)

    return (
        poly_kernel(f_t, f_t).mean().detach()
        + poly_kernel(f_s, f_s).mean()
        - 2 * poly_kernel(f_s, f_t).mean()
    )


def poly_kernel(a, b):
    a = a.unsqueeze(1)
    b = b.unsqueeze(2)
    res = (a * b).sum(-1).pow(2)
    return res


class NST(Distiller):
    """
    Like What You Like: Knowledge Distill via Neuron Selectivity Transfer
    """

    def __init__(self, student, teacher, cfg):
        super(NST, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.NST.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.NST.LOSS.FEAT_WEIGHT

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_feat = self.feat_loss_weight * nst_loss(
            feature_student["feats"][1:], feature_teacher["feats"][1:]
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_feat,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/OFD.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.stats import norm
import numpy as np
import math

from ._base import Distiller


def feat_loss(source, target, margin):
    margin = margin.to(source)
    loss = (
        (source - margin) ** 2 * ((source > margin) & (target <= margin)).float()
        + (source - target) ** 2
        * ((source > target) & (target > margin) & (target <= 0)).float()
        + (source - target) ** 2 * (target > 0).float()
    )
    return torch.abs(loss).mean(dim=0).sum()


class ConnectorConvBN(nn.Module):
    def __init__(self, s_channels, t_channels, kernel_size=1):
        super(ConnectorConvBN, self).__init__()
        self.s_channels = s_channels
        self.t_channels = t_channels
        self.connectors = nn.ModuleList(
            self._make_conenctors(s_channels, t_channels, kernel_size)
        )

    def _make_conenctors(self, s_channels, t_channels, kernel_size):
        assert len(s_channels) == len(t_channels), "unequal length of feat list"
        connectors = nn.ModuleList(
            [
                self._build_feature_connector(t, s, kernel_size)
                for t, s in zip(t_channels, s_channels)
            ]
        )
        return connectors

    def _build_feature_connector(self, t_channel, s_channel, kernel_size):
        C = [
            nn.Conv2d(
                s_channel,
                t_channel,
                kernel_size=kernel_size,
                stride=1,
                padding=(kernel_size - 1) // 2,
                bias=False,
            ),
            nn.BatchNorm2d(t_channel),
        ]
        for m in C:
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2.0 / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        return nn.Sequential(*C)

    def forward(self, g_s):
        out = []
        for i in range(len(g_s)):
            out.append(self.connectors[i](g_s[i]))

        return out


class OFD(Distiller):
    def __init__(self, student, teacher, cfg):
        super(OFD, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.OFD.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.OFD.LOSS.FEAT_WEIGHT
        self.init_ofd_modules(
            tea_channels=self.teacher.get_stage_channels()[1:],
            stu_channels=self.student.get_stage_channels()[1:],
            bn_before_relu=self.teacher.get_bn_before_relu(),
            kernel_size=cfg.OFD.CONNECTOR.KERNEL_SIZE,
        )

    def init_ofd_modules(
        self, tea_channels, stu_channels, bn_before_relu, kernel_size=1
    ):
        tea_channels, stu_channels = self._align_list(tea_channels, stu_channels)
        self.connectors = ConnectorConvBN(
            stu_channels, tea_channels, kernel_size=kernel_size
        )

        self.margins = []
        for idx, bn in enumerate(bn_before_relu):
            margin = []
            std = bn.weight.data
            mean = bn.bias.data
            for (s, m) in zip(std, mean):
                s = abs(s.item())
                m = m.item()
                if norm.cdf(-m / s) > 0.001:
                    margin.append(
                        -s
                        * math.exp(-((m / s) ** 2) / 2)
                        / math.sqrt(2 * math.pi)
                        / norm.cdf(-m / s)
                        + m
                    )
                else:
                    margin.append(-3 * s)
            margin = torch.FloatTensor(margin).to(std.device)
            self.margins.append(margin.unsqueeze(1).unsqueeze(2).unsqueeze(0).detach())

    def get_learnable_parameters(self):
        return super().get_learnable_parameters() + list(self.connectors.parameters())

    def train(self, mode=True):
        # teacher as eval mode by default
        if not isinstance(mode, bool):
            raise ValueError("training mode is expected to be boolean")
        self.training = mode
        for module in self.children():
            module.train(mode)
        return self

    def get_extra_parameters(self):
        num_p = 0
        for p in self.connectors.parameters():
            num_p += p.numel()
        return num_p

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_feat = self.feat_loss_weight * self.ofd_loss(
            feature_student["preact_feats"][1:], feature_teacher["preact_feats"][1:]
        )
        losses_dict = {"loss_ce": loss_ce, "loss_kd": loss_feat}
        return logits_student, losses_dict

    def ofd_loss(self, feature_student, feature_teacher):
        feature_student, feature_teacher = self._align_list(
            feature_student, feature_teacher
        )
        feature_student = [
            self.connectors.connectors[idx](feat)
            for idx, feat in enumerate(feature_student)
        ]

        loss_distill = 0
        feat_num = len(feature_student)
        for i in range(feat_num):
            loss_distill = loss_distill + feat_loss(
                feature_student[i],
                F.adaptive_avg_pool2d(
                    feature_teacher[i], feature_student[i].shape[-2:]
                ).detach(),
                self.margins[i],
            ) / 2 ** (feat_num - i - 1)
        return loss_distill

    def _align_list(self, *input_list):
        min_len = min([len(l) for l in input_list])
        return [l[-min_len:] for l in input_list]


================================================
FILE: mdistiller/distillers/PKT.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def pkt_loss(f_s, f_t, eps=1e-7):
    # Normalize each vector by its norm
    output_net_norm = torch.sqrt(torch.sum(f_s**2, dim=1, keepdim=True))
    f_s = f_s / (output_net_norm + eps)
    f_s[f_s != f_s] = 0
    target_net_norm = torch.sqrt(torch.sum(f_t**2, dim=1, keepdim=True))
    f_t = f_t / (target_net_norm + eps)
    f_t[f_t != f_t] = 0

    # Calculate the cosine similarity
    model_similarity = torch.mm(f_s, f_s.transpose(0, 1))
    target_similarity = torch.mm(f_t, f_t.transpose(0, 1))
    # Scale cosine similarity to 0..1
    model_similarity = (model_similarity + 1.0) / 2.0
    target_similarity = (target_similarity + 1.0) / 2.0
    # Transform them into probabilities
    model_similarity = model_similarity / torch.sum(
        model_similarity, dim=1, keepdim=True
    )
    target_similarity = target_similarity / torch.sum(
        target_similarity, dim=1, keepdim=True
    )
    # Calculate the KL-divergence
    loss = torch.mean(
        target_similarity
        * torch.log((target_similarity + eps) / (model_similarity + eps))
    )
    return loss


class PKT(Distiller):
    """
    Probabilistic Knowledge Transfer for deep representation learning
    Code from: https://github.com/passalis/probabilistic_kt
    """

    def __init__(self, student, teacher, cfg):
        super(PKT, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.PKT.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.PKT.LOSS.FEAT_WEIGHT

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # lossess
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_feat = self.feat_loss_weight * pkt_loss(
            feature_student["pooled_feat"], feature_teacher["pooled_feat"]
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_feat,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/RKD.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def _pdist(e, squared, eps):
    e_square = e.pow(2).sum(dim=1)
    prod = e @ e.t()
    res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clamp(min=eps)

    if not squared:
        res = res.sqrt()

    res = res.clone()
    res[range(len(e)), range(len(e))] = 0
    return res


def rkd_loss(f_s, f_t, squared=False, eps=1e-12, distance_weight=25, angle_weight=50):
    stu = f_s.view(f_s.shape[0], -1)
    tea = f_t.view(f_t.shape[0], -1)

    # RKD distance loss
    with torch.no_grad():
        t_d = _pdist(tea, squared, eps)
        mean_td = t_d[t_d > 0].mean()
        t_d = t_d / mean_td

    d = _pdist(stu, squared, eps)
    mean_d = d[d > 0].mean()
    d = d / mean_d

    loss_d = F.smooth_l1_loss(d, t_d)

    # RKD Angle loss
    with torch.no_grad():
        td = tea.unsqueeze(0) - tea.unsqueeze(1)
        norm_td = F.normalize(td, p=2, dim=2)
        t_angle = torch.bmm(norm_td, norm_td.transpose(1, 2)).view(-1)

    sd = stu.unsqueeze(0) - stu.unsqueeze(1)
    norm_sd = F.normalize(sd, p=2, dim=2)
    s_angle = torch.bmm(norm_sd, norm_sd.transpose(1, 2)).view(-1)

    loss_a = F.smooth_l1_loss(s_angle, t_angle)

    loss = distance_weight * loss_d + angle_weight * loss_a
    return loss


class RKD(Distiller):
    """Relational Knowledge Disitllation, CVPR2019"""

    def __init__(self, student, teacher, cfg):
        super(RKD, self).__init__(student, teacher)
        self.distance_weight = cfg.RKD.DISTANCE_WEIGHT
        self.angle_weight = cfg.RKD.ANGLE_WEIGHT
        self.ce_loss_weight = cfg.RKD.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.RKD.LOSS.FEAT_WEIGHT
        self.eps = cfg.RKD.PDIST.EPSILON
        self.squared = cfg.RKD.PDIST.SQUARED

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_rkd = self.feat_loss_weight * rkd_loss(
            feature_student["pooled_feat"],
            feature_teacher["pooled_feat"],
            self.squared,
            self.eps,
            self.distance_weight,
            self.angle_weight,
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_rkd,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/ReviewKD.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import pdb

from ._base import Distiller


def hcl_loss(fstudent, fteacher):
    loss_all = 0.0
    for fs, ft in zip(fstudent, fteacher):
        n, c, h, w = fs.shape
        loss = F.mse_loss(fs, ft, reduction="mean")
        cnt = 1.0
        tot = 1.0
        for l in [4, 2, 1]:
            if l >= h:
                continue
            tmpfs = F.adaptive_avg_pool2d(fs, (l, l))
            tmpft = F.adaptive_avg_pool2d(ft, (l, l))
            cnt /= 2.0
            loss += F.mse_loss(tmpfs, tmpft, reduction="mean") * cnt
            tot += cnt
        loss = loss / tot
        loss_all = loss_all + loss
    return loss_all


class ReviewKD(Distiller):
    def __init__(self, student, teacher, cfg):
        super(ReviewKD, self).__init__(student, teacher)
        self.shapes = cfg.REVIEWKD.SHAPES
        self.out_shapes = cfg.REVIEWKD.OUT_SHAPES
        in_channels = cfg.REVIEWKD.IN_CHANNELS
        out_channels = cfg.REVIEWKD.OUT_CHANNELS
        self.ce_loss_weight = cfg.REVIEWKD.CE_WEIGHT
        self.reviewkd_loss_weight = cfg.REVIEWKD.REVIEWKD_WEIGHT
        self.warmup_epochs = cfg.REVIEWKD.WARMUP_EPOCHS
        self.stu_preact = cfg.REVIEWKD.STU_PREACT
        self.max_mid_channel = cfg.REVIEWKD.MAX_MID_CHANNEL

        abfs = nn.ModuleList()
        mid_channel = min(512, in_channels[-1])
        for idx, in_channel in enumerate(in_channels):
            abfs.append(
                ABF(
                    in_channel,
                    mid_channel,
                    out_channels[idx],
                    idx < len(in_channels) - 1,
                )
            )
        self.abfs = abfs[::-1]

    def get_learnable_parameters(self):
        return super().get_learnable_parameters() + list(self.abfs.parameters())

    def get_extra_parameters(self):
        num_p = 0
        for p in self.abfs.parameters():
            num_p += p.numel()
        return num_p

    def forward_train(self, image, target, **kwargs):
        logits_student, features_student = self.student(image)
        with torch.no_grad():
            logits_teacher, features_teacher = self.teacher(image)

        # get features
        if self.stu_preact:
            x = features_student["preact_feats"] + [
                features_student["pooled_feat"].unsqueeze(-1).unsqueeze(-1)
            ]
        else:
            x = features_student["feats"] + [
                features_student["pooled_feat"].unsqueeze(-1).unsqueeze(-1)
            ]
        x = x[::-1]
        results = []
        out_features, res_features = self.abfs[0](x[0], out_shape=self.out_shapes[0])
        results.append(out_features)
        for features, abf, shape, out_shape in zip(
            x[1:], self.abfs[1:], self.shapes[1:], self.out_shapes[1:]
        ):
            out_features, res_features = abf(features, res_features, shape, out_shape)
            results.insert(0, out_features)
        features_teacher = features_teacher["preact_feats"][1:] + [
            features_teacher["pooled_feat"].unsqueeze(-1).unsqueeze(-1)
        ]
        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_reviewkd = (
            self.reviewkd_loss_weight
            * min(kwargs["epoch"] / self.warmup_epochs, 1.0)
            * hcl_loss(results, features_teacher)
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_reviewkd,
        }
        return logits_student, losses_dict


class ABF(nn.Module):
    def __init__(self, in_channel, mid_channel, out_channel, fuse):
        super(ABF, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channel, mid_channel, kernel_size=1, bias=False),
            nn.BatchNorm2d(mid_channel),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                mid_channel, out_channel, kernel_size=3, stride=1, padding=1, bias=False
            ),
            nn.BatchNorm2d(out_channel),
        )
        if fuse:
            self.att_conv = nn.Sequential(
                nn.Conv2d(mid_channel * 2, 2, kernel_size=1),
                nn.Sigmoid(),
            )
        else:
            self.att_conv = None
        nn.init.kaiming_uniform_(self.conv1[0].weight, a=1)  # pyre-ignore
        nn.init.kaiming_uniform_(self.conv2[0].weight, a=1)  # pyre-ignore

    def forward(self, x, y=None, shape=None, out_shape=None):
        n, _, h, w = x.shape
        # transform student features
        x = self.conv1(x)
        if self.att_conv is not None:
            # upsample residual features
            y = F.interpolate(y, (shape, shape), mode="nearest")
            # fusion
            z = torch.cat([x, y], dim=1)
            z = self.att_conv(z)
            x = x * z[:, 0].view(n, 1, h, w) + y * z[:, 1].view(n, 1, h, w)
        # output
        if x.shape[-1] != out_shape:
            x = F.interpolate(x, (out_shape, out_shape), mode="nearest")
        y = self.conv2(x)
        return y, x


================================================
FILE: mdistiller/distillers/SP.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from ._base import Distiller


def sp_loss(g_s, g_t):
    return sum([similarity_loss(f_s, f_t) for f_s, f_t in zip(g_s, g_t)])


def similarity_loss(f_s, f_t):
    bsz = f_s.shape[0]
    f_s = f_s.view(bsz, -1)
    f_t = f_t.view(bsz, -1)

    G_s = torch.mm(f_s, torch.t(f_s))
    G_s = torch.nn.functional.normalize(G_s)
    G_t = torch.mm(f_t, torch.t(f_t))
    G_t = torch.nn.functional.normalize(G_t)

    G_diff = G_t - G_s
    loss = (G_diff * G_diff).view(-1, 1).sum(0) / (bsz * bsz)
    return loss


class SP(Distiller):
    """Similarity-Preserving Knowledge Distillation, ICCV2019"""

    def __init__(self, student, teacher, cfg):
        super(SP, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.SP.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.SP.LOSS.FEAT_WEIGHT

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_feat = self.feat_loss_weight * sp_loss(
            [feature_student["feats"][-1]], [feature_teacher["feats"][-1]]
        )
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_feat,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/VID.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from ._base import Distiller
from ._common import get_feat_shapes


def conv1x1(in_channels, out_channels, stride=1):
    return nn.Conv2d(
        in_channels, out_channels, kernel_size=1, padding=0, bias=False, stride=stride
    )


def vid_loss(regressor, log_scale, f_s, f_t, eps=1e-5):
    # pool for dimentsion match
    s_H, t_H = f_s.shape[2], f_t.shape[2]
    if s_H > t_H:
        f_s = F.adaptive_avg_pool2d(f_s, (t_H, t_H))
    elif s_H < t_H:
        f_t = F.adaptive_avg_pool2d(f_t, (s_H, s_H))
    else:
        pass
    pred_mean = regressor(f_s)
    pred_var = torch.log(1.0 + torch.exp(log_scale)) + eps
    pred_var = pred_var.view(1, -1, 1, 1).to(pred_mean)
    neg_log_prob = 0.5 * ((pred_mean - f_t) ** 2 / pred_var + torch.log(pred_var))
    loss = torch.mean(neg_log_prob)
    return loss


class VID(Distiller):
    """
    Variational Information Distillation for Knowledge Transfer (CVPR 2019),
    code from author: https://github.com/ssahn0215/variational-information-distillation
    """

    def __init__(self, student, teacher, cfg):
        super(VID, self).__init__(student, teacher)
        self.ce_loss_weight = cfg.VID.LOSS.CE_WEIGHT
        self.feat_loss_weight = cfg.VID.LOSS.FEAT_WEIGHT
        self.init_pred_var = cfg.VID.INIT_PRED_VAR
        self.eps = cfg.VID.EPS
        feat_s_shapes, feat_t_shapes = get_feat_shapes(
            self.student, self.teacher, cfg.VID.INPUT_SIZE
        )
        feat_s_channels = [s[1] for s in feat_s_shapes[1:]]
        feat_t_channels = [s[1] for s in feat_t_shapes[1:]]
        self.init_vid_modules(feat_s_channels, feat_t_channels)

    def init_vid_modules(self, feat_s_shapes, feat_t_shapes):
        self.regressors = nn.ModuleList()
        self.log_scales = []
        for s, t in zip(feat_s_shapes, feat_t_shapes):
            regressor = nn.Sequential(
                conv1x1(s, t), nn.ReLU(), conv1x1(t, t), nn.ReLU(), conv1x1(t, t)
            )
            self.regressors.append(regressor)
            log_scale = torch.nn.Parameter(
                np.log(np.exp(self.init_pred_var - self.eps) - 1.0) * torch.ones(t)
            )
            self.log_scales.append(log_scale)

    def get_learnable_parameters(self):
        parameters = super().get_learnable_parameters()
        for regressor in self.regressors:
            parameters += list(regressor.parameters())
        return parameters

    def get_extra_parameters(self):
        num_p = 0
        for regressor in self.regressors:
            for p in regressor.parameters():
                num_p += p.numel()
        return num_p

    def forward_train(self, image, target, **kwargs):
        logits_student, feature_student = self.student(image)
        with torch.no_grad():
            _, feature_teacher = self.teacher(image)

        # losses
        loss_ce = self.ce_loss_weight * F.cross_entropy(logits_student, target)
        loss_vid = 0
        for i in range(len(feature_student["feats"][1:])):
            loss_vid += vid_loss(
                self.regressors[i],
                self.log_scales[i],
                feature_student["feats"][1:][i],
                feature_teacher["feats"][1:][i],
                self.eps,
            )
        loss_vid = self.feat_loss_weight * loss_vid
        losses_dict = {
            "loss_ce": loss_ce,
            "loss_kd": loss_vid,
        }
        return logits_student, losses_dict


================================================
FILE: mdistiller/distillers/__init__.py
================================================
from ._base import Vanilla
from .KD import KD
from .AT import AT
from .OFD import OFD
from .RKD import RKD
from .FitNet import FitNet
from .KDSVD import KDSVD
from .CRD import CRD
from .NST import NST
from .PKT import PKT
from .SP import SP
from .VID import VID
from .ReviewKD import ReviewKD
from .DKD import DKD

distiller_dict = {
    "NONE": Vanilla,
    "KD": KD,
    "AT": AT,
    "OFD": OFD,
    "RKD": RKD,
    "FITNET": FitNet,
    "KDSVD": KDSVD,
    "CRD": CRD,
    "NST": NST,
    "PKT": PKT,
    "SP": SP,
    "VID": VID,
    "REVIEWKD": ReviewKD,
    "DKD": DKD,
}


================================================
FILE: mdistiller/distillers/_base.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class Distiller(nn.Module):
    def __init__(self, student, teacher):
        super(Distiller, self).__init__()
        self.student = student
        self.teacher = teacher

    def train(self, mode=True):
        # teacher as eval mode by default
        if not isinstance(mode, bool):
            raise ValueError("training mode is expected to be boolean")
        self.training = mode
        for module in self.children():
            module.train(mode)
        self.teacher.eval()
        return self

    def get_learnable_parameters(self):
        # if the method introduces extra parameters, re-impl this function
        return [v for k, v in self.student.named_parameters()]

    def get_extra_parameters(self):
        # calculate the extra parameters introduced by the distiller
        return 0

    def forward_train(self, **kwargs):
        # training function for the distillation method
        raise NotImplementedError()

    def forward_test(self, image):
        return self.student(image)[0]

    def forward(self, **kwargs):
        if self.training:
            return self.forward_train(**kwargs)
        return self.forward_test(kwargs["image"])


class Vanilla(nn.Module):
    def __init__(self, student):
        super(Vanilla, self).__init__()
        self.student = student

    def get_learnable_parameters(self):
        return [v for k, v in self.student.named_parameters()]

    def forward_train(self, image, target, **kwargs):
        logits_student, _ = self.student(image)
        loss = F.cross_entropy(logits_student, target)
        return logits_student, {"ce": loss}

    def forward(self, **kwargs):
        if self.training:
            return self.forward_train(**kwargs)
        return self.forward_test(kwargs["image"])

    def forward_test(self, image):
        return self.student(image)[0]


================================================
FILE: mdistiller/distillers/_common.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvReg(nn.Module):
    """Convolutional regression"""

    def __init__(self, s_shape, t_shape, use_relu=True):
        super(ConvReg, self).__init__()
        self.use_relu = use_relu
        s_N, s_C, s_H, s_W = s_shape
        t_N, t_C, t_H, t_W = t_shape
        if s_H == 2 * t_H:
            self.conv = nn.Conv2d(s_C, t_C, kernel_size=3, stride=2, padding=1)
        elif s_H * 2 == t_H:
            self.conv = nn.ConvTranspose2d(s_C, t_C, kernel_size=4, stride=2, padding=1)
        elif s_H >= t_H:
            self.conv = nn.Conv2d(s_C, t_C, kernel_size=(1 + s_H - t_H, 1 + s_W - t_W))
        else:
            raise NotImplemented("student size {}, teacher size {}".format(s_H, t_H))
        self.bn = nn.BatchNorm2d(t_C)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv(x)
        if self.use_relu:
            return self.relu(self.bn(x))
        else:
            return self.bn(x)


def get_feat_shapes(student, teacher, input_size):
    data = torch.randn(1, 3, *input_size)
    with torch.no_grad():
        _, feat_s = student(data)
        _, feat_t = teacher(data)
    feat_s_shapes = [f.shape for f in feat_s["feats"]]
    feat_t_shapes = [f.shape for f in feat_t["feats"]]
    return feat_s_shapes, feat_t_shapes


================================================
FILE: mdistiller/engine/__init__.py
================================================
from .trainer import BaseTrainer, CRDTrainer, DOT, CRDDOT
trainer_dict = {
    "base": BaseTrainer,
    "crd": CRDTrainer,
    "dot": DOT,
    "crd_dot": CRDDOT,
}


================================================
FILE: mdistiller/engine/cfg.py
================================================
from yacs.config import CfgNode as CN
from .utils import log_msg


def show_cfg(cfg):
    dump_cfg = CN()
    dump_cfg.EXPERIMENT = cfg.EXPERIMENT
    dump_cfg.DATASET = cfg.DATASET
    dump_cfg.DISTILLER = cfg.DISTILLER
    dump_cfg.SOLVER = cfg.SOLVER
    dump_cfg.LOG = cfg.LOG
    if cfg.DISTILLER.TYPE in cfg:
        dump_cfg.update({cfg.DISTILLER.TYPE: cfg.get(cfg.DISTILLER.TYPE)})
    print(log_msg("CONFIG:\n{}".format(dump_cfg.dump()), "INFO"))


CFG = CN()

# Experiment
CFG.EXPERIMENT = CN()
CFG.EXPERIMENT.PROJECT = "distill"
CFG.EXPERIMENT.NAME = ""
CFG.EXPERIMENT.TAG = "default"

# Dataset
CFG.DATASET = CN()
CFG.DATASET.TYPE = "cifar100"
CFG.DATASET.NUM_WORKERS = 2
CFG.DATASET.TEST = CN()
CFG.DATASET.TEST.BATCH_SIZE = 64

# Distiller
CFG.DISTILLER = CN()
CFG.DISTILLER.TYPE = "NONE"  # Vanilla as default
CFG.DISTILLER.TEACHER = "ResNet50"
CFG.DISTILLER.STUDENT = "resnet32"

# Solver
CFG.SOLVER = CN()
CFG.SOLVER.TRAINER = "base"
CFG.SOLVER.BATCH_SIZE = 64
CFG.SOLVER.EPOCHS = 240
CFG.SOLVER.LR = 0.05
CFG.SOLVER.LR_DECAY_STAGES = [150, 180, 210]
CFG.SOLVER.LR_DECAY_RATE = 0.1
CFG.SOLVER.WEIGHT_DECAY = 0.0001
CFG.SOLVER.MOMENTUM = 0.9
CFG.SOLVER.TYPE = "SGD"

# Log
CFG.LOG = CN()
CFG.LOG.TENSORBOARD_FREQ = 500
CFG.LOG.SAVE_CHECKPOINT_FREQ = 40
CFG.LOG.PREFIX = "./output"
CFG.LOG.WANDB = True

#

Download .txt

gitextract_b7kwa80j/

├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── configs/
│   ├── cifar100/
│   │   ├── at.yaml
│   │   ├── crd.yaml
│   │   ├── dkd/
│   │   │   ├── res110_res32.yaml
│   │   │   ├── res32x4_res8x4.yaml
│   │   │   ├── res32x4_shuv1.yaml
│   │   │   ├── res32x4_shuv2.yaml
│   │   │   ├── res50_mv2.yaml
│   │   │   ├── res56_res20.yaml
│   │   │   ├── vgg13_mv2.yaml
│   │   │   ├── vgg13_vgg8.yaml
│   │   │   ├── wrn40_2_shuv1.yaml
│   │   │   ├── wrn40_2_wrn_16_2.yaml
│   │   │   └── wrn40_2_wrn_40_1.yaml
│   │   ├── dot/
│   │   │   ├── res32x4_res8x4.yaml
│   │   │   ├── res32x4_shuv2.yaml
│   │   │   └── vgg13_vgg8.yaml
│   │   ├── fitnet.yaml
│   │   ├── kd.yaml
│   │   ├── kdsvd.yaml
│   │   ├── nst.yaml
│   │   ├── ofd.yaml
│   │   ├── pkt.yaml
│   │   ├── reviewkd.yaml
│   │   ├── rkd.yaml
│   │   ├── sp.yaml
│   │   ├── vanilla.yaml
│   │   └── vid.yaml
│   ├── imagenet/
│   │   ├── r34_r18/
│   │   │   ├── at.yaml
│   │   │   ├── crd.yaml
│   │   │   ├── dkd.yaml
│   │   │   ├── dot.yaml
│   │   │   ├── kd.yaml
│   │   │   └── reviewkd.yaml
│   │   └── r50_mv1/
│   │       ├── at.yaml
│   │       ├── crd.yaml
│   │       ├── dkd.yaml
│   │       ├── dot.yaml
│   │       ├── kd.yaml
│   │       ├── ofd.yaml
│   │       └── reviewkd.yaml
│   └── tiny_imagenet/
│       └── dot/
│           ├── r18_mv2.yaml
│           └── r18_shuv2.yaml
├── detection/
│   ├── README.md
│   ├── __init__.py
│   ├── configs/
│   │   ├── Base-Distillation.yaml
│   │   ├── DKD/
│   │   │   ├── DKD-MV2-R50.yaml
│   │   │   ├── DKD-R18-R101.yaml
│   │   │   ├── DKD-R50-R101.yaml
│   │   │   ├── ReviewDKD-MV2-R50.yaml
│   │   │   ├── ReviewDKD-R18-R101.yaml
│   │   │   └── ReviewDKD-R50-R101.yaml
│   │   └── ReviewKD/
│   │       ├── ReviewKD-MV2-R50-Mask.yaml
│   │       ├── ReviewKD-MV2-R50.yaml
│   │       ├── ReviewKD-R18-R101-Mask.yaml
│   │       ├── ReviewKD-R18-R101.yaml
│   │       ├── ReviewKD-R50-R101-Mask.yaml
│   │       └── ReviewKD-R50-R101.yaml
│   ├── model/
│   │   ├── __init__.py
│   │   ├── backbone/
│   │   │   ├── __init__.py
│   │   │   ├── fpn.py
│   │   │   ├── mobilenetv2.py
│   │   │   └── resnet.py
│   │   ├── config.py
│   │   ├── rcnn.py
│   │   ├── reviewkd.py
│   │   └── teacher/
│   │       ├── __init__.py
│   │       └── teacher.py
│   └── train_net.py
├── mdistiller/
│   ├── __init__.py
│   ├── dataset/
│   │   ├── __init__.py
│   │   ├── cifar100.py
│   │   ├── imagenet.py
│   │   └── tiny_imagenet.py
│   ├── distillers/
│   │   ├── AT.py
│   │   ├── CRD.py
│   │   ├── DKD.py
│   │   ├── FitNet.py
│   │   ├── KD.py
│   │   ├── KDSVD.py
│   │   ├── NST.py
│   │   ├── OFD.py
│   │   ├── PKT.py
│   │   ├── RKD.py
│   │   ├── ReviewKD.py
│   │   ├── SP.py
│   │   ├── VID.py
│   │   ├── __init__.py
│   │   ├── _base.py
│   │   └── _common.py
│   ├── engine/
│   │   ├── __init__.py
│   │   ├── cfg.py
│   │   ├── dot.py
│   │   ├── trainer.py
│   │   └── utils.py
│   └── models/
│       ├── __init__.py
│       ├── cifar/
│       │   ├── ShuffleNetv1.py
│       │   ├── ShuffleNetv2.py
│       │   ├── __init__.py
│       │   ├── mobilenetv2.py
│       │   ├── mv2_tinyimagenet.py
│       │   ├── resnet.py
│       │   ├── resnetv2.py
│       │   ├── vgg.py
│       │   └── wrn.py
│       └── imagenet/
│           ├── __init__.py
│           ├── mobilenetv1.py
│           └── resnet.py
├── requirements.txt
├── setup.py
└── tools/
    ├── eval.py
    ├── train.py
    └── visualizations/
        ├── correlation.ipynb
        └── tsne.ipynb

Download .txt

SYMBOL INDEX (408 symbols across 42 files)

FILE: detection/model/backbone/fpn.py
  function build_resnet_fpn_backbone_kd (line 10) | def build_resnet_fpn_backbone_kd(cfg, input_shape: ShapeSpec):
  function build_mobilenetv2_fpn_backbone (line 33) | def build_mobilenetv2_fpn_backbone(cfg, input_shape: ShapeSpec):

FILE: detection/model/backbone/mobilenetv2.py
  function _make_divisible (line 27) | def _make_divisible(v, divisor, min_value=None):
  function conv_3x3_bn (line 47) | def conv_3x3_bn(inp, oup, stride, bn):
  function conv_1x1_bn (line 55) | def conv_1x1_bn(inp, oup):
  class InvertedResidual (line 63) | class InvertedResidual(nn.Module):
    method __init__ (line 64) | def __init__(self, inp, oup, stride, expand_ratio, bn):
    method forward (line 96) | def forward(self, x):
    method freeze (line 102) | def freeze(self):
  class MobileNetV2 (line 110) | class MobileNetV2(Backbone):
    method __init__ (line 111) | def __init__(self, cfg, input_shape, width_mult = 1.):
    method forward (line 166) | def forward(self, x):
    method _initialize_weights (line 182) | def _initialize_weights(self):
    method output_shape (line 195) | def output_shape(self):
  function build_mobilenetv2_backbone (line 206) | def build_mobilenetv2_backbone(cfg, input_shape):

FILE: detection/model/backbone/resnet.py
  class ResNetBlockBase (line 30) | class ResNetBlockBase(nn.Module):
    method __init__ (line 31) | def __init__(self, in_channels, out_channels, stride):
    method freeze (line 45) | def freeze(self):
  class BasicBlock (line 52) | class BasicBlock(ResNetBlockBase):
    method __init__ (line 53) | def __init__(
    method forward (line 104) | def forward(self, x):
  class BottleneckBlock (line 119) | class BottleneckBlock(ResNetBlockBase):
    method __init__ (line 120) | def __init__(
    method forward (line 204) | def forward(self, x):
  class DeformBottleneckBlock (line 223) | class DeformBottleneckBlock(ResNetBlockBase):
    method __init__ (line 224) | def __init__(
    method forward (line 311) | def forward(self, x):
  function make_stage (line 338) | def make_stage(block_class, num_blocks, first_stride, **kwargs):
  class BasicStem (line 359) | class BasicStem(nn.Module):
    method __init__ (line 360) | def __init__(self, in_channels=3, out_channels=64, norm="BN"):
    method forward (line 379) | def forward(self, x):
    method out_channels (line 386) | def out_channels(self):
    method stride (line 390) | def stride(self):
  class ResNet (line 394) | class ResNet(Backbone):
    method __init__ (line 395) | def __init__(self, stem, stages, num_classes=None, out_features=None):
    method forward (line 446) | def forward(self, x):
    method output_shape (line 463) | def output_shape(self):
  function build_resnet_backbone_kd (line 473) | def build_resnet_backbone_kd(cfg, input_shape):

FILE: detection/model/config.py
  function add_distillation_cfg (line 4) | def add_distillation_cfg(cfg):
  function add_teacher_cfg (line 29) | def add_teacher_cfg(cfg):

FILE: detection/model/rcnn.py
  function rcnn_dkd_loss (line 26) | def rcnn_dkd_loss(stu_predictions, tea_predictions, gt_classes, alpha, b...
  class RCNNKD (line 36) | class RCNNKD(nn.Module):
    method __init__ (line 45) | def __init__(
    method from_config (line 95) | def from_config(cls, cfg):
    method device (line 114) | def device(self):
    method visualize_training (line 117) | def visualize_training(self, batched_inputs, proposals):
    method forward_pure_roi_head (line 152) | def forward_pure_roi_head(self, roi_head, features, proposals):
    method forward (line 159) | def forward(self, batched_inputs: Tuple[Dict[str, torch.Tensor]]):
    method inference (line 242) | def inference(
    method preprocess_image (line 288) | def preprocess_image(self, batched_inputs: Tuple[Dict[str, torch.Tenso...
    method teacher_preprocess_image (line 297) | def teacher_preprocess_image(self, batched_inputs: Tuple[Dict[str, tor...
    method _postprocess (line 309) | def _postprocess(instances, batched_inputs: Tuple[Dict[str, torch.Tens...

FILE: detection/model/reviewkd.py
  class ABF (line 5) | class ABF(nn.Module):
    method __init__ (line 6) | def __init__(self, in_channel, mid_channel, out_channel, fuse):
    method forward (line 26) | def forward(self, x, y=None, shape=None):
  class ReviewKD (line 42) | class ReviewKD(nn.Module):
    method __init__ (line 43) | def __init__(
    method forward (line 56) | def forward(self, student_features):
  function build_kd_trans (line 68) | def build_kd_trans(cfg):
  function hcl (line 75) | def hcl(fstudent, fteacher):

FILE: detection/model/teacher/teacher.py
  class Teacher (line 8) | class Teacher(nn.Module):
    method __init__ (line 9) | def __init__(self, backbone, proposal_generator, roi_heads):
  function build_teacher (line 15) | def build_teacher(cfg):

FILE: detection/train_net.py
  class Trainer (line 46) | class Trainer(DefaultTrainer):
    method build_evaluator (line 55) | def build_evaluator(cls, cfg, dataset_name, output_folder=None):
    method test_with_TTA (line 103) | def test_with_TTA(cls, cfg, model):
  function setup (line 120) | def setup(args):
  function main (line 133) | def main(args):

FILE: mdistiller/dataset/__init__.py
  function get_dataset (line 6) | def get_dataset(cfg):

FILE: mdistiller/dataset/cifar100.py
  function get_data_folder (line 8) | def get_data_folder():
  class CIFAR100Instance (line 15) | class CIFAR100Instance(datasets.CIFAR100):
    method __getitem__ (line 18) | def __getitem__(self, index):
  class CIFAR100InstanceSample (line 24) | class CIFAR100InstanceSample(datasets.CIFAR100):
    method __init__ (line 29) | def __init__(
    method __getitem__ (line 84) | def __getitem__(self, index):
  function get_cifar100_train_transform (line 117) | def get_cifar100_train_transform():
  function get_cifar100_test_transform (line 130) | def get_cifar100_test_transform():
  function get_cifar100_dataloaders (line 139) | def get_cifar100_dataloaders(batch_size, val_batch_size, num_workers):
  function get_cifar100_dataloaders_sample (line 164) | def get_cifar100_dataloaders_sample(

FILE: mdistiller/dataset/imagenet.py
  class ImageNet (line 11) | class ImageNet(ImageFolder):
    method __getitem__ (line 12) | def __getitem__(self, index):
  class ImageNetInstanceSample (line 17) | class ImageNetInstanceSample(ImageNet):
    method __init__ (line 20) | def __init__(self, folder, transform=None, target_transform=None,
    method __getitem__ (line 50) | def __getitem__(self, index):
  function get_imagenet_train_transform (line 68) | def get_imagenet_train_transform(mean, std):
  function get_imagenet_test_transform (line 80) | def get_imagenet_test_transform(mean, std):
  function get_imagenet_dataloaders (line 92) | def get_imagenet_dataloaders(batch_size, val_batch_size, num_workers,
  function get_imagenet_dataloaders_sample (line 103) | def get_imagenet_dataloaders_sample(batch_size, val_batch_size, num_work...
  function get_imagenet_val_loader (line 114) | def get_imagenet_val_loader(val_batch_size, mean=[0.485, 0.456, 0.406], ...

FILE: mdistiller/dataset/tiny_imagenet.py
  class ImageFolderInstance (line 13) | class ImageFolderInstance(datasets.ImageFolder):
    method __getitem__ (line 14) | def __getitem__(self, index):
  class ImageFolderInstanceSample (line 22) | class ImageFolderInstanceSample(ImageFolderInstance):
    method __init__ (line 25) | def __init__(self, folder, transform=None, target_transform=None,
    method __getitem__ (line 54) | def __getitem__(self, index):
  function get_tinyimagenet_dataloader (line 73) | def get_tinyimagenet_dataloader(batch_size, val_batch_size, num_workers):
  function get_tinyimagenet_dataloader_sample (line 99) | def get_tinyimagenet_dataloader_sample(batch_size, val_batch_size, num_w...

FILE: mdistiller/distillers/AT.py
  function single_stage_at_loss (line 8) | def single_stage_at_loss(f_s, f_t, p):
  function at_loss (line 20) | def at_loss(g_s, g_t, p):
  class AT (line 24) | class AT(Distiller):
    method __init__ (line 30) | def __init__(self, student, teacher, cfg):
    method forward_train (line 36) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/CRD.py
  class CRD (line 9) | class CRD(Distiller):
    method __init__ (line 12) | def __init__(self, student, teacher, cfg, num_data):
    method init_crd_modules (line 26) | def init_crd_modules(
    method get_learnable_parameters (line 42) | def get_learnable_parameters(self):
    method get_extra_parameters (line 49) | def get_extra_parameters(self):
    method crd_loss (line 60) | def crd_loss(self, f_s, f_t, idx, contrast_idx):
    method forward_train (line 68) | def forward_train(self, image, target, index, contrastive_index, **kwa...
  class Normalize (line 88) | class Normalize(nn.Module):
    method __init__ (line 91) | def __init__(self, power=2):
    method forward (line 95) | def forward(self, x):
  class Embed (line 101) | class Embed(nn.Module):
    method __init__ (line 104) | def __init__(self, dim_in=1024, dim_out=128):
    method forward (line 109) | def forward(self, x):
  class ContrastLoss (line 116) | class ContrastLoss(nn.Module):
    method __init__ (line 119) | def __init__(self, num_data):
    method forward (line 123) | def forward(self, x):
  class ContrastMemory (line 144) | class ContrastMemory(nn.Module):
    method __init__ (line 147) | def __init__(self, inputSize, output_size, K, T=0.07, momentum=0.5):
    method forward (line 164) | def forward(self, v1, v2, y, idx=None):
  class AliasMethod (line 223) | class AliasMethod(object):
    method __init__ (line 228) | def __init__(self, probs):
    method cuda (line 265) | def cuda(self):
    method draw (line 269) | def draw(self, N):

FILE: mdistiller/distillers/DKD.py
  function dkd_loss (line 8) | def dkd_loss(logits_student, logits_teacher, target, alpha, beta, temper...
  function _get_gt_mask (line 35) | def _get_gt_mask(logits, target):
  function _get_other_mask (line 41) | def _get_other_mask(logits, target):
  function cat_mask (line 47) | def cat_mask(t, mask1, mask2):
  class DKD (line 54) | class DKD(Distiller):
    method __init__ (line 57) | def __init__(self, student, teacher, cfg):
    method forward_train (line 65) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/FitNet.py
  class FitNet (line 9) | class FitNet(Distiller):
    method __init__ (line 12) | def __init__(self, student, teacher, cfg):
    method get_learnable_parameters (line 24) | def get_learnable_parameters(self):
    method get_extra_parameters (line 27) | def get_extra_parameters(self):
    method forward_train (line 33) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/KD.py
  function kd_loss (line 8) | def kd_loss(logits_student, logits_teacher, temperature):
  class KD (line 16) | class KD(Distiller):
    method __init__ (line 19) | def __init__(self, student, teacher, cfg):
    method forward_train (line 25) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/KDSVD.py
  function kdsvd_loss (line 8) | def kdsvd_loss(g_s, g_t, k):
  function svd (line 38) | def svd(feat, n=1):
  function removenan (line 57) | def removenan(x):
  function align_rsv (line 62) | def align_rsv(a, b):
  class KDSVD (line 74) | class KDSVD(Distiller):
    method __init__ (line 80) | def __init__(self, student, teacher, cfg):
    method forward_train (line 86) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/NST.py
  function nst_loss (line 8) | def nst_loss(g_s, g_t):
  function single_stage_nst_loss (line 12) | def single_stage_nst_loss(f_s, f_t):
  function poly_kernel (line 31) | def poly_kernel(a, b):
  class NST (line 38) | class NST(Distiller):
    method __init__ (line 43) | def __init__(self, student, teacher, cfg):
    method forward_train (line 48) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/OFD.py
  function feat_loss (line 11) | def feat_loss(source, target, margin):
  class ConnectorConvBN (line 22) | class ConnectorConvBN(nn.Module):
    method __init__ (line 23) | def __init__(self, s_channels, t_channels, kernel_size=1):
    method _make_conenctors (line 31) | def _make_conenctors(self, s_channels, t_channels, kernel_size):
    method _build_feature_connector (line 41) | def _build_feature_connector(self, t_channel, s_channel, kernel_size):
    method forward (line 62) | def forward(self, g_s):
  class OFD (line 70) | class OFD(Distiller):
    method __init__ (line 71) | def __init__(self, student, teacher, cfg):
    method init_ofd_modules (line 82) | def init_ofd_modules(
    method get_learnable_parameters (line 111) | def get_learnable_parameters(self):
    method train (line 114) | def train(self, mode=True):
    method get_extra_parameters (line 123) | def get_extra_parameters(self):
    method forward_train (line 129) | def forward_train(self, image, target, **kwargs):
    method ofd_loss (line 142) | def ofd_loss(self, feature_student, feature_teacher):
    method _align_list (line 163) | def _align_list(self, *input_list):

FILE: mdistiller/distillers/PKT.py
  function pkt_loss (line 8) | def pkt_loss(f_s, f_t, eps=1e-7):
  class PKT (line 38) | class PKT(Distiller):
    method __init__ (line 44) | def __init__(self, student, teacher, cfg):
    method forward_train (line 49) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/RKD.py
  function _pdist (line 8) | def _pdist(e, squared, eps):
  function rkd_loss (line 21) | def rkd_loss(f_s, f_t, squared=False, eps=1e-12, distance_weight=25, ang...
  class RKD (line 53) | class RKD(Distiller):
    method __init__ (line 56) | def __init__(self, student, teacher, cfg):
    method forward_train (line 65) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/ReviewKD.py
  function hcl_loss (line 11) | def hcl_loss(fstudent, fteacher):
  class ReviewKD (line 31) | class ReviewKD(Distiller):
    method __init__ (line 32) | def __init__(self, student, teacher, cfg):
    method get_learnable_parameters (line 57) | def get_learnable_parameters(self):
    method get_extra_parameters (line 60) | def get_extra_parameters(self):
    method forward_train (line 66) | def forward_train(self, image, target, **kwargs):
  class ABF (line 106) | class ABF(nn.Module):
    method __init__ (line 107) | def __init__(self, in_channel, mid_channel, out_channel, fuse):
    method forward (line 129) | def forward(self, x, y=None, shape=None, out_shape=None):

FILE: mdistiller/distillers/SP.py
  function sp_loss (line 8) | def sp_loss(g_s, g_t):
  function similarity_loss (line 12) | def similarity_loss(f_s, f_t):
  class SP (line 27) | class SP(Distiller):
    method __init__ (line 30) | def __init__(self, student, teacher, cfg):
    method forward_train (line 35) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/VID.py
  function conv1x1 (line 10) | def conv1x1(in_channels, out_channels, stride=1):
  function vid_loss (line 16) | def vid_loss(regressor, log_scale, f_s, f_t, eps=1e-5):
  class VID (line 33) | class VID(Distiller):
    method __init__ (line 39) | def __init__(self, student, teacher, cfg):
    method init_vid_modules (line 52) | def init_vid_modules(self, feat_s_shapes, feat_t_shapes):
    method get_learnable_parameters (line 65) | def get_learnable_parameters(self):
    method get_extra_parameters (line 71) | def get_extra_parameters(self):
    method forward_train (line 78) | def forward_train(self, image, target, **kwargs):

FILE: mdistiller/distillers/_base.py
  class Distiller (line 6) | class Distiller(nn.Module):
    method __init__ (line 7) | def __init__(self, student, teacher):
    method train (line 12) | def train(self, mode=True):
    method get_learnable_parameters (line 22) | def get_learnable_parameters(self):
    method get_extra_parameters (line 26) | def get_extra_parameters(self):
    method forward_train (line 30) | def forward_train(self, **kwargs):
    method forward_test (line 34) | def forward_test(self, image):
    method forward (line 37) | def forward(self, **kwargs):
  class Vanilla (line 43) | class Vanilla(nn.Module):
    method __init__ (line 44) | def __init__(self, student):
    method get_learnable_parameters (line 48) | def get_learnable_parameters(self):
    method forward_train (line 51) | def forward_train(self, image, target, **kwargs):
    method forward (line 56) | def forward(self, **kwargs):
    method forward_test (line 61) | def forward_test(self, image):

FILE: mdistiller/distillers/_common.py
  class ConvReg (line 6) | class ConvReg(nn.Module):
    method __init__ (line 9) | def __init__(self, s_shape, t_shape, use_relu=True):
    method forward (line 25) | def forward(self, x):
  function get_feat_shapes (line 33) | def get_feat_shapes(student, teacher, input_size):

FILE: mdistiller/engine/cfg.py
  function show_cfg (line 5) | def show_cfg(cfg):

FILE: mdistiller/engine/dot.py
  function check_in (line 9) | def check_in(t, l):
  function dot (line 15) | def dot(params: List[Tensor],
  class DistillationOrientedTrainer (line 58) | class DistillationOrientedTrainer(Optimizer):
    method __init__ (line 73) | def __init__(
    method step_kd (line 97) | def step_kd(self, closure=None):
    method step (line 127) | def step(self, closure=None):

FILE: mdistiller/engine/trainer.py
  class BaseTrainer (line 22) | class BaseTrainer(object):
    method __init__ (line 23) | def __init__(self, experiment_name, distiller, train_loader, val_loade...
    method init_optimizer (line 38) | def init_optimizer(self, cfg):
    method log (line 50) | def log(self, lr, epoch, log_dict):
    method train (line 77) | def train(self, resume=False):
    method train_epoch (line 92) | def train_epoch(self, epoch):
    method train_iter (line 153) | def train_iter(self, data, epoch, train_meters):
  class CRDTrainer (line 189) | class CRDTrainer(BaseTrainer):
    method train_iter (line 190) | def train_iter(self, data, epoch, train_meters):
  class DOT (line 229) | class DOT(BaseTrainer):
    method init_optimizer (line 230) | def init_optimizer(self, cfg):
    method train (line 245) | def train(self, resume=False):
    method train_iter (line 260) | def train_iter(self, data, epoch, train_meters):
  class CRDDOT (line 300) | class CRDDOT(BaseTrainer):
    method init_optimizer (line 302) | def init_optimizer(self, cfg):
    method train (line 317) | def train(self, resume=False):
    method train_iter (line 332) | def train_iter(self, data, epoch, train_meters):

FILE: mdistiller/engine/utils.py
  class AverageMeter (line 10) | class AverageMeter(object):
    method __init__ (line 13) | def __init__(self):
    method reset (line 16) | def reset(self):
    method update (line 22) | def update(self, val, n=1):
  function validate (line 29) | def validate(val_loader, distiller):
  function log_msg (line 62) | def log_msg(msg, mode="INFO"):
  function adjust_learning_rate (line 72) | def adjust_learning_rate(epoch, cfg, optimizer):
  function accuracy (line 82) | def accuracy(output, target, topk=(1,)):
  function save_checkpoint (line 96) | def save_checkpoint(obj, path):
  function load_checkpoint (line 101) | def load_checkpoint(path):

FILE: mdistiller/models/cifar/ShuffleNetv1.py
  class ShuffleBlock (line 6) | class ShuffleBlock(nn.Module):
    method __init__ (line 7) | def __init__(self, groups):
    method forward (line 11) | def forward(self, x):
  class Bottleneck (line 18) | class Bottleneck(nn.Module):
    method __init__ (line 19) | def __init__(self, in_planes, out_planes, stride, groups, is_last=False):
    method forward (line 50) | def forward(self, x):
  class ShuffleNet (line 65) | class ShuffleNet(nn.Module):
    method __init__ (line 66) | def __init__(self, cfg, num_classes=10):
    method _make_layer (line 81) | def _make_layer(self, out_planes, num_blocks, groups):
    method get_feat_modules (line 98) | def get_feat_modules(self):
    method get_bn_before_relu (line 107) | def get_bn_before_relu(self):
    method forward (line 112) | def forward(
  function ShuffleV1 (line 137) | def ShuffleV1(**kwargs):

FILE: mdistiller/models/cifar/ShuffleNetv2.py
  class ShuffleBlock (line 6) | class ShuffleBlock(nn.Module):
    method __init__ (line 7) | def __init__(self, groups=2):
    method forward (line 11) | def forward(self, x):
  class SplitBlock (line 18) | class SplitBlock(nn.Module):
    method __init__ (line 19) | def __init__(self, ratio):
    method forward (line 23) | def forward(self, x):
  class BasicBlock (line 28) | class BasicBlock(nn.Module):
    method __init__ (line 29) | def __init__(self, in_channels, split_ratio=0.5, is_last=False):
    method forward (line 50) | def forward(self, x):
  class DownBlock (line 66) | class DownBlock(nn.Module):
    method __init__ (line 67) | def __init__(self, in_channels, out_channels):
    method forward (line 101) | def forward(self, x):
  class ShuffleNetV2 (line 115) | class ShuffleNetV2(nn.Module):
    method __init__ (line 116) | def __init__(self, net_size, num_classes=10):
    method _make_layer (line 141) | def _make_layer(self, out_channels, num_blocks):
    method get_feat_modules (line 148) | def get_feat_modules(self):
    method get_bn_before_relu (line 157) | def get_bn_before_relu(self):
    method get_stage_channels (line 162) | def get_stage_channels(self):
    method forward (line 165) | def forward(self, x):
  function ShuffleV2 (line 200) | def ShuffleV2(**kwargs):

FILE: mdistiller/models/cifar/mobilenetv2.py
  function conv_bn (line 10) | def conv_bn(inp, oup, stride):
  function conv_1x1_bn (line 18) | def conv_1x1_bn(inp, oup):
  class InvertedResidual (line 26) | class InvertedResidual(nn.Module):
    method __init__ (line 27) | def __init__(self, inp, oup, stride, expand_ratio):
    method forward (line 59) | def forward(self, x):
  class MobileNetV2 (line 67) | class MobileNetV2(nn.Module):
    method __init__ (line 70) | def __init__(self, T, feature_dim, input_size=32, width_mult=1.0, remo...
    method get_bn_before_relu (line 121) | def get_bn_before_relu(self):
    method get_feat_modules (line 128) | def get_feat_modules(self):
    method get_stage_channels (line 134) | def get_stage_channels(self):
    method forward (line 137) | def forward(self, x):
    method _initialize_weights (line 165) | def _initialize_weights(self):
  function mobilenetv2_T_w (line 181) | def mobilenetv2_T_w(T, W, feature_dim=100):
  function mobile_half (line 186) | def mobile_half(num_classes):

FILE: mdistiller/models/cifar/mv2_tinyimagenet.py
  class LinearBottleNeck (line 6) | class LinearBottleNeck(nn.Module):
    method __init__ (line 8) | def __init__(self, in_channels, out_channels, stride, t=6, class_num=1...
    method forward (line 28) | def forward(self, x):
  class MobileNetV2 (line 37) | class MobileNetV2(nn.Module):
    method __init__ (line 39) | def __init__(self, num_classes=100):
    method forward (line 64) | def forward(self, x):
    method _make_stage (line 89) | def _make_stage(self, repeat, in_channels, out_channels, stride, t):
  function mobilenetv2_tinyimagenet (line 100) | def mobilenetv2_tinyimagenet(**kwargs):

FILE: mdistiller/models/cifar/resnet.py
  function conv3x3 (line 9) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 16) | class BasicBlock(nn.Module):
    method __init__ (line 19) | def __init__(self, inplanes, planes, stride=1, downsample=None, is_las...
    method forward (line 30) | def forward(self, x):
  class Bottleneck (line 52) | class Bottleneck(nn.Module):
    method __init__ (line 55) | def __init__(self, inplanes, planes, stride=1, downsample=None, is_las...
    method forward (line 70) | def forward(self, x):
  class ResNet (line 96) | class ResNet(nn.Module):
    method __init__ (line 97) | def __init__(self, depth, num_filters, block_name="BasicBlock", num_cl...
    method _make_layer (line 133) | def _make_layer(self, block, planes, blocks, stride=1):
    method get_feat_modules (line 157) | def get_feat_modules(self):
    method get_bn_before_relu (line 167) | def get_bn_before_relu(self):
    method get_stage_channels (line 181) | def get_stage_channels(self):
    method forward (line 184) | def forward(self, x):
  function resnet8 (line 209) | def resnet8(**kwargs):
  function resnet14 (line 213) | def resnet14(**kwargs):
  function resnet20 (line 217) | def resnet20(**kwargs):
  function resnet32 (line 221) | def resnet32(**kwargs):
  function resnet44 (line 225) | def resnet44(**kwargs):
  function resnet56 (line 229) | def resnet56(**kwargs):
  function resnet110 (line 233) | def resnet110(**kwargs):
  function resnet8x4 (line 237) | def resnet8x4(**kwargs):
  function resnet32x4 (line 241) | def resnet32x4(**kwargs):

FILE: mdistiller/models/cifar/resnetv2.py
  class BasicBlock (line 6) | class BasicBlock(nn.Module):
    method __init__ (line 9) | def __init__(self, in_planes, planes, stride=1, is_last=False):
    method forward (line 34) | def forward(self, x):
  class Bottleneck (line 46) | class Bottleneck(nn.Module):
    method __init__ (line 49) | def __init__(self, in_planes, planes, stride=1, is_last=False):
    method forward (line 76) | def forward(self, x):
  class ResNet (line 89) | class ResNet(nn.Module):
    method __init__ (line 90) | def __init__(self, block, num_blocks, num_classes=10, zero_init_residu...
    method get_feat_modules (line 121) | def get_feat_modules(self):
    method get_bn_before_relu (line 131) | def get_bn_before_relu(self):
    method get_stage_channels (line 147) | def get_stage_channels(self):
    method _make_layer (line 150) | def _make_layer(self, block, planes, num_blocks, stride):
    method encode (line 159) | def encode(self, x, idx, preact=False):
    method forward (line 170) | def forward(self, x):
  function ResNet18 (line 193) | def ResNet18(**kwargs):
  function ResNet34 (line 197) | def ResNet34(**kwargs):
  function ResNet50 (line 201) | def ResNet50(**kwargs):
  function ResNet101 (line 205) | def ResNet101(**kwargs):
  function ResNet152 (line 209) | def ResNet152(**kwargs):

FILE: mdistiller/models/cifar/vgg.py
  class VGG (line 27) | class VGG(nn.Module):
    method __init__ (line 28) | def __init__(self, cfg, batch_norm=False, num_classes=1000):
    method get_feat_modules (line 47) | def get_feat_modules(self):
    method get_bn_before_relu (line 61) | def get_bn_before_relu(self):
    method get_stage_channels (line 68) | def get_stage_channels(self):
    method forward (line 71) | def forward(self, x):
    method _make_layers (line 109) | def _make_layers(cfg, batch_norm=False, in_channels=3):
    method _initialize_weights (line 124) | def _initialize_weights(self):
  function vgg8 (line 155) | def vgg8(**kwargs):
  function vgg8_bn (line 164) | def vgg8_bn(**kwargs):
  function vgg11 (line 173) | def vgg11(**kwargs):
  function vgg11_bn (line 182) | def vgg11_bn(**kwargs):
  function vgg13 (line 188) | def vgg13(**kwargs):
  function vgg13_bn (line 197) | def vgg13_bn(**kwargs):
  function vgg16 (line 203) | def vgg16(**kwargs):
  function vgg16_bn (line 212) | def vgg16_bn(**kwargs):
  function vgg19 (line 218) | def vgg19(**kwargs):
  function vgg19_bn (line 227) | def vgg19_bn(**kwargs):

FILE: mdistiller/models/cifar/wrn.py
  class BasicBlock (line 10) | class BasicBlock(nn.Module):
    method __init__ (line 11) | def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
    method forward (line 38) | def forward(self, x):
  class NetworkBlock (line 50) | class NetworkBlock(nn.Module):
    method __init__ (line 51) | def __init__(self, nb_layers, in_planes, out_planes, block, stride, dr...
    method _make_layer (line 57) | def _make_layer(self, block, in_planes, out_planes, nb_layers, stride,...
    method forward (line 70) | def forward(self, x):
  class WideResNet (line 74) | class WideResNet(nn.Module):
    method __init__ (line 75) | def __init__(self, depth, num_classes, widen_factor=1, dropRate=0.0):
    method get_feat_modules (line 108) | def get_feat_modules(self):
    method get_bn_before_relu (line 116) | def get_bn_before_relu(self):
    method get_stage_channels (line 123) | def get_stage_channels(self):
    method forward (line 126) | def forward(self, x):
  function wrn (line 153) | def wrn(**kwargs):
  function wrn_40_2 (line 161) | def wrn_40_2(**kwargs):
  function wrn_40_1 (line 166) | def wrn_40_1(**kwargs):
  function wrn_16_2 (line 171) | def wrn_16_2(**kwargs):
  function wrn_16_1 (line 176) | def wrn_16_1(**kwargs):

FILE: mdistiller/models/imagenet/mobilenetv1.py
  class MobileNetV1 (line 6) | class MobileNetV1(nn.Module):
    method __init__ (line 7) | def __init__(self, **kwargs):
    method forward (line 46) | def forward(self, x, is_feat=False):
    method get_bn_before_relu (line 61) | def get_bn_before_relu(self):
    method get_stage_channels (line 68) | def get_stage_channels(self):

FILE: mdistiller/models/imagenet/resnet.py
  function conv3x3 (line 20) | def conv3x3(in_planes, out_planes, stride=1):
  class BasicBlock (line 27) | class BasicBlock(nn.Module):
    method __init__ (line 30) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 40) | def forward(self, x):
  class Bottleneck (line 60) | class Bottleneck(nn.Module):
    method __init__ (line 63) | def __init__(self, inplanes, planes, stride=1, downsample=None):
    method forward (line 77) | def forward(self, x):
  class ResNet (line 101) | class ResNet(nn.Module):
    method __init__ (line 102) | def __init__(self, block, layers, num_classes=1000):
    method _make_layer (line 124) | def _make_layer(self, block, planes, blocks, stride=1):
    method get_bn_before_relu (line 145) | def get_bn_before_relu(self):
    method get_stage_channels (line 161) | def get_stage_channels(self):
    method forward (line 164) | def forward(self, x):
  function resnet18 (line 195) | def resnet18(pretrained=False, **kwargs):
  function resnet34 (line 206) | def resnet34(pretrained=False, **kwargs):
  function resnet50 (line 217) | def resnet50(pretrained=False, **kwargs):
  function resnet101 (line 228) | def resnet101(pretrained=False, **kwargs):
  function resnet152 (line 239) | def resnet152(pretrained=False, **kwargs):

FILE: tools/train.py
  function main (line 18) | def main(cfg, resume, opts):

Download .json

Condensed preview — 117 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (662K chars).

[
  {
    "path": ".gitattributes",
    "chars": 27,
    "preview": "* linguist-language=Python\n"
  },
  {
    "path": ".gitignore",
    "chars": 1376,
    "preview": ".idea/\ndata\ndata/\noutput\noutput*/\ndetection/datasets\ndetection/output\ndetection/output*/\nckpts/\n*.pth\n*.t7\ntmp*.py\n\n*.pd"
  },
  {
    "path": "LICENSE",
    "chars": 1701,
    "preview": "MIT License\n\nCopyright (c) 2022 MEGVII Research\n\nPermission is hereby granted, free of charge, to any person obtaining a"
  },
  {
    "path": "README.md",
    "chars": 7712,
    "preview": "<div align=center><img src=\".github/mdistiller.png\" width=\"40%\" ><div align=left>\n\nThis repo is\n\n(1) a PyTorch library t"
  },
  {
    "path": "configs/cifar100/at.yaml",
    "chars": 313,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"at,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"AT\"\n  TEACHER: \"re"
  },
  {
    "path": "configs/cifar100/crd.yaml",
    "chars": 332,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"crd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"CRD\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/dkd/res110_res32.yaml",
    "chars": 327,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res110,res32\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"re"
  },
  {
    "path": "configs/cifar100/dkd/res32x4_res8x4.yaml",
    "chars": 315,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/dkd/res32x4_shuv1.yaml",
    "chars": 314,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res32x4,shuv1\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"r"
  },
  {
    "path": "configs/cifar100/dkd/res32x4_shuv2.yaml",
    "chars": 314,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res32x4,shuv2\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"r"
  },
  {
    "path": "configs/cifar100/dkd/res50_mv2.yaml",
    "chars": 310,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res50,mv2\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"ResNe"
  },
  {
    "path": "configs/cifar100/dkd/res56_res20.yaml",
    "chars": 325,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res56,res20\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"res"
  },
  {
    "path": "configs/cifar100/dkd/vgg13_mv2.yaml",
    "chars": 324,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,vgg13,mv2\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"vgg13"
  },
  {
    "path": "configs/cifar100/dkd/vgg13_vgg8.yaml",
    "chars": 317,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,vgg13,vgg8\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \"vgg1"
  },
  {
    "path": "configs/cifar100/dkd/wrn40_2_shuv1.yaml",
    "chars": 313,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,wrn_40_2,shuv1\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/dkd/wrn40_2_wrn_16_2.yaml",
    "chars": 331,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,wrn_40_2,wrn_16_2\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER"
  },
  {
    "path": "configs/cifar100/dkd/wrn40_2_wrn_40_1.yaml",
    "chars": 331,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,wrn_40_2,wrn_40_1\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"DKD\"\n  TEACHER"
  },
  {
    "path": "configs/cifar100/dot/res32x4_res8x4.yaml",
    "chars": 348,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,res32x4,res8x4\"\n  PROJECT: \"dot_cifar\"\nDISTILLER:\n  TYPE: \"KD\"\n  TEACHER: \"resnet3"
  },
  {
    "path": "configs/cifar100/dot/res32x4_shuv2.yaml",
    "chars": 347,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,res32x4,shuv2\"\n  PROJECT: \"dot_cifar\"\nDISTILLER:\n  TYPE: \"KD\"\n  TEACHER: \"resnet32"
  },
  {
    "path": "configs/cifar100/dot/vgg13_vgg8.yaml",
    "chars": 334,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,vgg13,vgg8\"\n  PROJECT: \"dot_cifar\"\nDISTILLER:\n  TYPE: \"KD\"\n  TEACHER: \"vgg13\"\n  ST"
  },
  {
    "path": "configs/cifar100/fitnet.yaml",
    "chars": 320,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"fitnet,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"FITNET\"\n  TEAC"
  },
  {
    "path": "configs/cifar100/kd.yaml",
    "chars": 312,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"KD\"\n  TEACHER: \"re"
  },
  {
    "path": "configs/cifar100/kdsvd.yaml",
    "chars": 318,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kdsvd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"KDSVD\"\n  TEACHE"
  },
  {
    "path": "configs/cifar100/nst.yaml",
    "chars": 314,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"nst,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"NST\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/ofd.yaml",
    "chars": 315,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"ofd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"OFD\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/pkt.yaml",
    "chars": 314,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"pkt,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"PKT\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/reviewkd.yaml",
    "chars": 358,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"reviewkd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"REVIEWKD\"\n  "
  },
  {
    "path": "configs/cifar100/rkd.yaml",
    "chars": 314,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"rkd,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"RKD\"\n  TEACHER: \""
  },
  {
    "path": "configs/cifar100/sp.yaml",
    "chars": 312,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"sp,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"SP\"\n  TEACHER: \"re"
  },
  {
    "path": "configs/cifar100/vanilla.yaml",
    "chars": 319,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"vanilla,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"NONE\"\n  TEACH"
  },
  {
    "path": "configs/cifar100/vid.yaml",
    "chars": 314,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"vid,res32x4,res8x4\"\n  PROJECT: \"cifar100_baselines\"\nDISTILLER:\n  TYPE: \"VID\"\n  TEACHER: \""
  },
  {
    "path": "configs/imagenet/r34_r18/at.yaml",
    "chars": 432,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"at,res34,res18\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_WORKERS"
  },
  {
    "path": "configs/imagenet/r34_r18/crd.yaml",
    "chars": 507,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"crd,res34,res18\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_WORKER"
  },
  {
    "path": "configs/imagenet/r34_r18/dkd.yaml",
    "chars": 489,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res34,res18\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_WORKER"
  },
  {
    "path": "configs/imagenet/r34_r18/dot.yaml",
    "chars": 537,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,res34,res18\"\n  PROJECT: \"dot_imagenet\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_WORKERS: "
  },
  {
    "path": "configs/imagenet/r34_r18/kd.yaml",
    "chars": 499,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,res34,res18\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_WORKERS"
  },
  {
    "path": "configs/imagenet/r34_r18/reviewkd.yaml",
    "chars": 659,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"reviewkd,res34,res18\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_W"
  },
  {
    "path": "configs/imagenet/r50_mv1/at.yaml",
    "chars": 441,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"at,res50,mobilenetv1\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_W"
  },
  {
    "path": "configs/imagenet/r50_mv1/crd.yaml",
    "chars": 518,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"crd,res50,mobilenetv1\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_"
  },
  {
    "path": "configs/imagenet/r50_mv1/dkd.yaml",
    "chars": 498,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"dkd,res50,mobilenetv1\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_"
  },
  {
    "path": "configs/imagenet/r50_mv1/dot.yaml",
    "chars": 546,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,res50,mobilenetv1\"\n  PROJECT: \"dot_imagenet\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_WOR"
  },
  {
    "path": "configs/imagenet/r50_mv1/kd.yaml",
    "chars": 508,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,res50,mobilenetv1\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_W"
  },
  {
    "path": "configs/imagenet/r50_mv1/ofd.yaml",
    "chars": 481,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"ofd,res50,mobilenetv1\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n  NUM_"
  },
  {
    "path": "configs/imagenet/r50_mv1/reviewkd.yaml",
    "chars": 698,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"reviewkd,res50,mobilenetv1\"\n  PROJECT: \"imagenet_baselines\"\nDATASET:\n  TYPE: \"imagenet\"\n "
  },
  {
    "path": "configs/tiny_imagenet/dot/r18_mv2.yaml",
    "chars": 398,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,r18,mv2\"\n  PROJECT: \"dot_tinyimagenet\"\nDATASET:\n  TYPE: \"tiny_imagenet\"\n  NUM_WORK"
  },
  {
    "path": "configs/tiny_imagenet/dot/r18_shuv2.yaml",
    "chars": 398,
    "preview": "EXPERIMENT:\n  NAME: \"\"\n  TAG: \"kd,dot,r18,shuv2\"\n  PROJECT: \"dot_tinyimagenet\"\nDATASET:\n  TYPE: \"tiny_imagenet\"\n  NUM_WO"
  },
  {
    "path": "detection/README.md",
    "chars": 1126,
    "preview": "# COCO object detection and instance segmentation\n\nPS: based on the [ReviewKD's codebase](https://github.com/dvlab-resea"
  },
  {
    "path": "detection/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "detection/configs/Base-Distillation.yaml",
    "chars": 2484,
    "preview": "MODEL:\n  META_ARCHITECTURE: \"RCNNKD\"\n  BACKBONE:\n    NAME: \"build_resnet_fpn_backbone_kd\"\n  RESNETS:\n    OUT_FEATURES: ["
  },
  {
    "path": "detection/configs/DKD/DKD-MV2-R50.yaml",
    "chars": 530,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/DKD-MV2-R50\nMODEL:\n  BACKBONE:\n    NAME: \"build_mobilenetv2_fpn_b"
  },
  {
    "path": "detection/configs/DKD/DKD-R18-R101.yaml",
    "chars": 627,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/DKD-R18-R101\nINPUT:\n  FORMAT: 'RGB'\nMODEL:\n  PIXEL_STD: [57.375, "
  },
  {
    "path": "detection/configs/DKD/DKD-R50-R101.yaml",
    "chars": 540,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/DKD-R50-R101\nMODEL:\n  BACKBONE:\n    NAME: \"build_resnet_fpn_backb"
  },
  {
    "path": "detection/configs/DKD/ReviewDKD-MV2-R50.yaml",
    "chars": 576,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewDKD-MV2-R50\nMODEL:\n  BACKBONE:\n    NAME: \"build_mobilenetv2"
  },
  {
    "path": "detection/configs/DKD/ReviewDKD-R18-R101.yaml",
    "chars": 672,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewDKD-R18-R101\nINPUT:\n  FORMAT: 'RGB'\nMODEL:\n  PIXEL_STD: [57"
  },
  {
    "path": "detection/configs/DKD/ReviewDKD-R50-R101.yaml",
    "chars": 585,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewDKD-R50-R101\nMODEL:\n  BACKBONE:\n    NAME: \"build_resnet_fpn"
  },
  {
    "path": "detection/configs/ReviewKD/ReviewKD-MV2-R50-Mask.yaml",
    "chars": 598,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewKD-MV2-R50-Mask\nMODEL:\n  MASK_ON: True\n  BACKBONE:\n    NAME"
  },
  {
    "path": "detection/configs/ReviewKD/ReviewKD-MV2-R50.yaml",
    "chars": 574,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewKD-MV2-R50\nMODEL:\n  BACKBONE:\n    NAME: \"build_mobilenetv2_"
  },
  {
    "path": "detection/configs/ReviewKD/ReviewKD-R18-R101-Mask.yaml",
    "chars": 695,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewKD-R18-R101-Mask\nINPUT:\n  FORMAT: 'RGB'\nMODEL:\n  PIXEL_STD:"
  },
  {
    "path": "detection/configs/ReviewKD/ReviewKD-R18-R101.yaml",
    "chars": 670,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewKD-R18-R101\nINPUT:\n  FORMAT: 'RGB'\nMODEL:\n  PIXEL_STD: [57."
  },
  {
    "path": "detection/configs/ReviewKD/ReviewKD-R50-R101-Mask.yaml",
    "chars": 608,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewKD-R50-R101-Mask\nMODEL:\n  MASK_ON: True\n  BACKBONE:\n    NAM"
  },
  {
    "path": "detection/configs/ReviewKD/ReviewKD-R50-R101.yaml",
    "chars": 583,
    "preview": "_BASE_: \"../Base-Distillation.yaml\"\nOUTPUT_DIR: output/ReviewKD-R50-R101\nMODEL:\n  BACKBONE:\n    NAME: \"build_resnet_fpn_"
  },
  {
    "path": "detection/model/__init__.py",
    "chars": 131,
    "preview": "import torch\n\nfrom .rcnn import RCNNKD\nfrom .config import add_distillation_cfg\nfrom .backbone import build_resnet_fpn_b"
  },
  {
    "path": "detection/model/backbone/__init__.py",
    "chars": 123,
    "preview": "from .resnet import build_resnet_backbone_kd\nfrom .fpn import build_resnet_fpn_backbone_kd, build_mobilenetv2_fpn_backbo"
  },
  {
    "path": "detection/model/backbone/fpn.py",
    "chars": 1630,
    "preview": "from .resnet import build_resnet_backbone_kd\nfrom .mobilenetv2 import build_mobilenetv2_backbone\nfrom detectron2.modelin"
  },
  {
    "path": "detection/model/backbone/mobilenetv2.py",
    "chars": 7054,
    "preview": "\"\"\"\nCreates a MobileNetV2 Model as defined in:\nMark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh "
  },
  {
    "path": "detection/model/backbone/resnet.py",
    "chars": 17668,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\nimport numpy as np\nimport fvcore.nn.weight_init a"
  },
  {
    "path": "detection/model/config.py",
    "chars": 32549,
    "preview": "from detectron2.config import CfgNode as CN\nimport numpy as np\n\ndef add_distillation_cfg(cfg):\n    cfg.MODEL.MOBILENETV2"
  },
  {
    "path": "detection/model/rcnn.py",
    "chars": 14730,
    "preview": "# Copyright (c) Facebook, Inc. and its affiliates.\nimport logging\nimport numpy as np\nfrom typing import Dict, List, Opti"
  },
  {
    "path": "detection/model/reviewkd.py",
    "chars": 2982,
    "preview": "import torch\nfrom torch import nn\nimport torch.nn.functional as F\n\nclass ABF(nn.Module):\n    def __init__(self, in_chann"
  },
  {
    "path": "detection/model/teacher/__init__.py",
    "chars": 35,
    "preview": "from .teacher import build_teacher\n"
  },
  {
    "path": "detection/model/teacher/teacher.py",
    "chars": 1060,
    "preview": "from detectron2.modeling.backbone import build_backbone\nfrom detectron2.modeling.proposal_generator import build_proposa"
  },
  {
    "path": "detection/train_net.py",
    "chars": 6295,
    "preview": "#!/usr/bin/env python\n# Copyright (c) Facebook, Inc. and its affiliates.\n\"\"\"\nA main training script.\n\nThis scripts reads"
  },
  {
    "path": "mdistiller/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "mdistiller/dataset/__init__.py",
    "chars": 2448,
    "preview": "from .cifar100 import get_cifar100_dataloaders, get_cifar100_dataloaders_sample\nfrom .imagenet import get_imagenet_datal"
  },
  {
    "path": "mdistiller/dataset/cifar100.py",
    "chars": 5741,
    "preview": "import os\nimport numpy as np\nfrom torch.utils.data import DataLoader\nfrom torchvision import datasets, transforms\nfrom P"
  },
  {
    "path": "mdistiller/dataset/imagenet.py",
    "chars": 4748,
    "preview": "import os\nimport numpy as np\nimport torch\nfrom torchvision.datasets import ImageFolder\nimport torchvision.transforms as "
  },
  {
    "path": "mdistiller/dataset/tiny_imagenet.py",
    "chars": 4823,
    "preview": "import os\nfrom torch.utils.data import DataLoader\nfrom torchvision import datasets\nfrom torchvision import transforms\nim"
  },
  {
    "path": "mdistiller/distillers/AT.py",
    "chars": 1624,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef single_stage_at_l"
  },
  {
    "path": "mdistiller/distillers/CRD.py",
    "chars": 9154,
    "preview": "import torch\nfrom torch import nn\nimport torch.nn.functional as F\nimport math\n\nfrom ._base import Distiller\n\n\nclass CRD("
  },
  {
    "path": "mdistiller/distillers/DKD.py",
    "chars": 2690,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef dkd_loss(logits_s"
  },
  {
    "path": "mdistiller/distillers/FitNet.py",
    "chars": 1642,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\nfrom ._common import Co"
  },
  {
    "path": "mdistiller/distillers/KD.py",
    "chars": 1304,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef kd_loss(logits_st"
  },
  {
    "path": "mdistiller/distillers/KDSVD.py",
    "chars": 2772,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef kdsvd_loss(g_s, g"
  },
  {
    "path": "mdistiller/distillers/NST.py",
    "chars": 1766,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef nst_loss(g_s, g_t"
  },
  {
    "path": "mdistiller/distillers/OFD.py",
    "chars": 5779,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom scipy.stats import norm\nimport numpy as np\nimpor"
  },
  {
    "path": "mdistiller/distillers/PKT.py",
    "chars": 2144,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef pkt_loss(f_s, f_t"
  },
  {
    "path": "mdistiller/distillers/RKD.py",
    "chars": 2513,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef _pdist(e, squared"
  },
  {
    "path": "mdistiller/distillers/ReviewKD.py",
    "chars": 5093,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\nimport math\nimport pdb\n\nf"
  },
  {
    "path": "mdistiller/distillers/SP.py",
    "chars": 1451,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nfrom ._base import Distiller\n\n\ndef sp_loss(g_s, g_t)"
  },
  {
    "path": "mdistiller/distillers/VID.py",
    "chars": 3498,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport numpy as np\n\nfrom ._base import Distiller\nfrom"
  },
  {
    "path": "mdistiller/distillers/__init__.py",
    "chars": 579,
    "preview": "from ._base import Vanilla\nfrom .KD import KD\nfrom .AT import AT\nfrom .OFD import OFD\nfrom .RKD import RKD\nfrom .FitNet "
  },
  {
    "path": "mdistiller/distillers/_base.py",
    "chars": 1912,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass Distiller(nn.Module):\n    def __init__(self, "
  },
  {
    "path": "mdistiller/distillers/_common.py",
    "chars": 1355,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass ConvReg(nn.Module):\n    \"\"\"Convolutional regr"
  },
  {
    "path": "mdistiller/engine/__init__.py",
    "chars": 164,
    "preview": "from .trainer import BaseTrainer, CRDTrainer, DOT, CRDDOT\ntrainer_dict = {\n    \"base\": BaseTrainer,\n    \"crd\": CRDTraine"
  },
  {
    "path": "mdistiller/engine/cfg.py",
    "chars": 3776,
    "preview": "from yacs.config import CfgNode as CN\nfrom .utils import log_msg\n\n\ndef show_cfg(cfg):\n    dump_cfg = CN()\n    dump_cfg.E"
  },
  {
    "path": "mdistiller/engine/dot.py",
    "chars": 6472,
    "preview": "import math\nimport torch\nfrom torch import Tensor\nimport torch.optim._functional as F\nfrom torch.optim.optimizer import "
  },
  {
    "path": "mdistiller/engine/trainer.py",
    "chars": 14478,
    "preview": "import os\nimport time\nfrom tqdm import tqdm\nimport torch\nimport torch.nn as nn\nimport torch.optim as optim\nfrom collecti"
  },
  {
    "path": "mdistiller/engine/utils.py",
    "chars": 2948,
    "preview": "import os\nimport torch\nimport torch.nn as nn\nimport numpy as np\nimport sys\nimport time\nfrom tqdm import tqdm\n\n\nclass Ave"
  },
  {
    "path": "mdistiller/models/__init__.py",
    "chars": 104,
    "preview": "from .cifar import cifar_model_dict, tiny_imagenet_model_dict\nfrom .imagenet import imagenet_model_dict\n"
  },
  {
    "path": "mdistiller/models/cifar/ShuffleNetv1.py",
    "chars": 4779,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass ShuffleBlock(nn.Module):\n    def __init__(sel"
  },
  {
    "path": "mdistiller/models/cifar/ShuffleNetv2.py",
    "chars": 6998,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass ShuffleBlock(nn.Module):\n    def __init__(sel"
  },
  {
    "path": "mdistiller/models/cifar/__init__.py",
    "chars": 2275,
    "preview": "import os\nfrom .resnet import (\n    resnet8,\n    resnet14,\n    resnet20,\n    resnet32,\n    resnet44,\n    resnet56,\n    r"
  },
  {
    "path": "mdistiller/models/cifar/mobilenetv2.py",
    "chars": 5693,
    "preview": "import torch\nimport torch.nn as nn\nimport math\n\n__all__ = [\"mobilenetv2_T_w\", \"mobile_half\"]\n\nBN = None\n\n\ndef conv_bn(in"
  },
  {
    "path": "mdistiller/models/cifar/mv2_tinyimagenet.py",
    "chars": 2835,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass LinearBottleNeck(nn.Module):\n\n    def __init_"
  },
  {
    "path": "mdistiller/models/cifar/resnet.py",
    "chars": 7572,
    "preview": "from __future__ import absolute_import\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\n__all__ = [\"resnet\"]\n\n\nde"
  },
  {
    "path": "mdistiller/models/cifar/resnetv2.py",
    "chars": 7246,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass BasicBlock(nn.Module):\n    expansion = 1\n\n   "
  },
  {
    "path": "mdistiller/models/cifar/vgg.py",
    "chars": 6890,
    "preview": "import torch.nn as nn\nimport torch.nn.functional as F\nimport math\n\n\n__all__ = [\n    \"VGG\",\n    \"vgg11\",\n    \"vgg11_bn\",\n"
  },
  {
    "path": "mdistiller/models/cifar/wrn.py",
    "chars": 5705,
    "preview": "import math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\n__all__ = [\"wrn\"]\n\n\nclass BasicBlock(nn"
  },
  {
    "path": "mdistiller/models/imagenet/__init__.py",
    "chars": 274,
    "preview": "from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152\nfrom .mobilenetv1 import MobileNetV1\n\n\nimagenet_m"
  },
  {
    "path": "mdistiller/models/imagenet/mobilenetv1.py",
    "chars": 2281,
    "preview": "import torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass MobileNetV1(nn.Module):\n    def __init__(self"
  },
  {
    "path": "mdistiller/models/imagenet/resnet.py",
    "chars": 7702,
    "preview": "import torch\nimport torch.nn as nn\nimport math\nimport torch.utils.model_zoo as model_zoo\nimport torch.nn.functional as F"
  },
  {
    "path": "requirements.txt",
    "chars": 88,
    "preview": "torch==1.9.0\ntorchvision==0.10.0\ntensorboard-logger==0.1.0\nyacs\nwandb\ntqdm\ntensorboardX\n"
  },
  {
    "path": "setup.py",
    "chars": 637,
    "preview": "\"\"\"MDistiller: a deep learning toolkit for knowledge distillation.\n\"\"\"\n\nimport os.path\nimport sys\nimport setuptools\n\n\nif"
  },
  {
    "path": "tools/eval.py",
    "chars": 1999,
    "preview": "import argparse\nimport torch\nimport torch.backends.cudnn as cudnn\n\ncudnn.benchmark = True\n\nfrom mdistiller.distillers im"
  },
  {
    "path": "tools/train.py",
    "chars": 4095,
    "preview": "import os\nimport argparse\nimport torch\nimport torch.nn as nn\nimport torch.backends.cudnn as cudnn\n\ncudnn.benchmark = Tru"
  },
  {
    "path": "tools/visualizations/correlation.ipynb",
    "chars": 102365,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Correlation matrices visualizatio"
  },
  {
    "path": "tools/visualizations/tsne.ipynb",
    "chars": 259184,
    "preview": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# t-SNE visualization of CIFAR-100 "
  }
]

About this extraction

This page contains the full source code of the megvii-research/mdistiller GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 117 files (626.8 KB), approximately 323.7k tokens, and a symbol index with 408 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo