Repository: orhir/PoseAnything
Branch: main
Commit: 7f1253f8c6a9
Files: 95
Total size: 575.5 KB
Directory structure:
PoseAnything/
├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── configs/
│ ├── 1shot-swin/
│ │ ├── base_split1_config.py
│ │ ├── base_split2_config.py
│ │ ├── base_split3_config.py
│ │ ├── base_split4_config.py
│ │ ├── base_split5_config.py
│ │ ├── graph_split1_config.py
│ │ ├── graph_split2_config.py
│ │ ├── graph_split3_config.py
│ │ ├── graph_split4_config.py
│ │ └── graph_split5_config.py
│ ├── 1shots/
│ │ ├── base_split1_config.py
│ │ ├── base_split2_config.py
│ │ ├── base_split3_config.py
│ │ ├── base_split4_config.py
│ │ ├── base_split5_config.py
│ │ ├── graph_split1_config.py
│ │ ├── graph_split2_config.py
│ │ ├── graph_split3_config.py
│ │ ├── graph_split4_config.py
│ │ └── graph_split5_config.py
│ ├── 5shot-swin/
│ │ ├── base_split1_config.py
│ │ ├── base_split2_config.py
│ │ ├── base_split3_config.py
│ │ ├── base_split4_config.py
│ │ ├── base_split5_config.py
│ │ ├── graph_split1_config.py
│ │ ├── graph_split2_config.py
│ │ ├── graph_split3_config.py
│ │ ├── graph_split4_config.py
│ │ └── graph_split5_config.py
│ ├── 5shots/
│ │ ├── base_split1_config.py
│ │ ├── base_split2_config.py
│ │ ├── base_split3_config.py
│ │ ├── base_split4_config.py
│ │ ├── base_split5_config.py
│ │ ├── graph_split1_config.py
│ │ ├── graph_split2_config.py
│ │ ├── graph_split3_config.py
│ │ ├── graph_split4_config.py
│ │ └── graph_split5_config.py
│ └── demo_b.py
├── demo.py
├── docker/
│ └── Dockerfile
├── models/
│ ├── VERSION
│ ├── __init__.py
│ ├── apis/
│ │ ├── __init__.py
│ │ └── train.py
│ ├── core/
│ │ ├── __init__.py
│ │ └── custom_hooks/
│ │ └── shuffle_hooks.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ └── mp100/
│ │ │ ├── __init__.py
│ │ │ ├── fewshot_base_dataset.py
│ │ │ ├── fewshot_dataset.py
│ │ │ ├── test_base_dataset.py
│ │ │ ├── test_dataset.py
│ │ │ ├── transformer_base_dataset.py
│ │ │ └── transformer_dataset.py
│ │ └── pipelines/
│ │ ├── __init__.py
│ │ ├── post_transforms.py
│ │ └── top_down_transform.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── backbones/
│ │ │ ├── __init__.py
│ │ │ ├── simmim.py
│ │ │ ├── swin_mlp.py
│ │ │ ├── swin_transformer.py
│ │ │ ├── swin_transformer_moe.py
│ │ │ ├── swin_transformer_v2.py
│ │ │ └── swin_utils.py
│ │ ├── detectors/
│ │ │ ├── __init__.py
│ │ │ └── pam.py
│ │ ├── keypoint_heads/
│ │ │ ├── __init__.py
│ │ │ └── head.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ ├── encoder_decoder.py
│ │ ├── positional_encoding.py
│ │ └── transformer.py
│ └── version.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── test.py
├── tools/
│ ├── dist_test.sh
│ ├── dist_train.sh
│ ├── fix_carfusion.py
│ ├── slurm_test.sh
│ ├── slurm_train.sh
│ └── visualization.py
└── train.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.eggs/*
.vscode/*
work_dirs/*
work_dir/*
pretrained/*
ckpt/*
runai_dataset/*
*/__pycache__
*.pyc
data/*
data
output/*
.idea/*
pose_anything.egg-info/*
================================================
FILE: LICENSE
================================================
Copyright (c) 2022 SenseTime. All Rights Reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2020 MMClassification Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
:new: *Please check out [EdgeCape](https://github.com/orhir/EdgeCape), our more recent effort in the same line of work.*
# A Graph-Based Approach for Category-Agnostic Pose Estimation [ECCV 2024]
[Hugging Face Spaces demo](https://huggingface.co/spaces/orhir/PoseAnything)
[OpenXLab demo](https://openxlab.org.cn/apps/detail/orhir/Pose-Anything)
[Papers with Code: 2D Pose Estimation on MP-100](https://paperswithcode.com/sota/2d-pose-estimation-on-mp-100?p=pose-anything-a-graph-based-approach-for)
By [Or Hirschorn](https://scholar.google.co.il/citations?user=GgFuT_QAAAAJ&hl=iw&oi=ao) and [Shai Avidan](https://scholar.google.co.il/citations?hl=iw&user=hpItE1QAAAAJ)
This repo is the official implementation of "[A Graph-Based Approach for Category-Agnostic Pose Estimation](https://arxiv.org/pdf/2311.17891.pdf)".
## 🔔 News
- **`11 July 2024`** Our paper will be presented at **ECCV 2024**.
- **`10 July 2024`** Uploaded new annotations, fixing a small bug in the DeepFashion skeletons.
- **`2 February 2024`** Uploaded new weights - smaller models with stronger performance.
- **`20 December 2023`** Demo is online on [Huggingface](https://huggingface.co/spaces/orhir/PoseAnything) and [OpenXLab](https://openxlab.org.cn/apps/detail/orhir/Pose-Anything).
- **`7 December 2023`** Official code release.
## Introduction
We present a novel approach to category-agnostic pose estimation (CAPE) that leverages the inherent geometrical relations between keypoints through a newly designed Graph Transformer Decoder. By capturing and incorporating this crucial structural information, our method enhances the accuracy of keypoint localization, marking a significant departure from conventional CAPE techniques that treat keypoints as isolated entities.
## Citation
If you find this useful, please cite this work as follows:
```bibtex
@misc{hirschorn2023pose,
title={A Graph-Based Approach for Category-Agnostic Pose Estimation},
author={Or Hirschorn and Shai Avidan},
year={2024},
eprint={2311.17891},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2311.17891},
}
```
## Getting Started
### Docker [Recommended]
We provide a Docker image for easy use.
You can simply pull the image, which contains all the required libraries and packages, from Docker Hub:
```
docker pull orhir/pose_anything
docker run --name pose_anything -v {DATA_DIR}:/workspace/PoseAnything/PoseAnything/data/mp100 -it orhir/pose_anything /bin/bash
```
### Conda Environment
We train and evaluate our model on Python 3.8 and PyTorch 2.0.1 with CUDA 12.1.
Please first install PyTorch and torchvision following the official [PyTorch documentation](https://pytorch.org/get-started/locally/).
Then, follow [MMPose](https://mmpose.readthedocs.io/en/latest/installation.html) to install the following packages:
```
mmcv-full==1.6.2
mmpose==0.29.0
```
Having installed these packages, run:
```
python setup.py develop
```
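Alternatively, a minimal setup sketch using OpenMIM (this mirrors the commented setup lines in `app.py`; treat it as a sketch, and install PyTorch for your CUDA version first):
```
pip install -U openmim
python -m mim install mmengine
python -m mim install "mmcv-full==1.6.2"
python -m mim install "mmpose==0.29.0"
python setup.py develop
```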
## Demo on Custom Images
TRY IT NOW ON: [HuggingFace](https://huggingface.co/spaces/orhir/PoseAnything) / [OpenXLab](https://openxlab.org.cn/apps/detail/orhir/Pose-Anything)
We provide demo code for testing our model on custom images.
### Gradio Demo
First, install Gradio:
```
pip install gradio==3.44.0
```
Then, download the [pretrained model](https://drive.google.com/file/d/1RT1Q8AMEa1kj6k9ZqrtWIKyuR4Jn4Pqc/view?usp=drive_link) and run:
```
python app.py --checkpoint [path_to_pretrained_ckpt]
```
### Terminal Demo
Download
the [pretrained model](https://drive.google.com/file/d/1RT1Q8AMEa1kj6k9ZqrtWIKyuR4Jn4Pqc/view?usp=drive_link)
and run:
```
python demo.py --support [path_to_support_image] --query [path_to_query_image] --config configs/demo_b.py --checkpoint [path_to_pretrained_ckpt]
```
***Note:*** The demo code supports any config with a suitable checkpoint file. More pre-trained models can be found in the evaluation section.
## Updated MP-100 Dataset
Please follow the [official guide](https://github.com/luminxu/Pose-for-Everything/blob/main/mp100/README.md) to prepare the MP-100 dataset for training and evaluation, and organize the data structure properly.
**We provide an updated annotation file, which includes skeleton definitions, in the following [link](https://drive.google.com/drive/folders/1uRyGB-P5Tc_6TmAZ6RnOi0SWjGq9b28T?usp=sharing).**
**Please note:**
The current version of the MP-100 dataset includes some discrepancies and filename errors:
1. The dataset referred to as DeepFashion is actually the DeepFashion2 dataset. The link in the official repo is wrong; use this [repo](https://github.com/switchablenorms/DeepFashion2/tree/master) instead.
2. We provide a script to fix CarFusion filename errors, which can be run by:
```
python tools/fix_carfusion.py [path_to_CarFusion_dataset] [path_to_mp100_annotation]
```
## Training
### Backbone Options
To use the pre-trained Swin Transformer backbone from our paper, we provide the weights, taken from this [repo](https://github.com/microsoft/Swin-Transformer/blob/main/MODELHUB.md), at the following [link](https://drive.google.com/drive/folders/1-q4mSxlNAUwDlevc3Hm5Ij0l_2OGkrcg?usp=sharing).
These should be placed in the `./pretrained` folder.
We also support DINO and ResNet backbones. To use one of them, set the `pretrained` field in the config file to `dinov2`, `dino`, or `resnet`; the pretrained weights are then loaded automatically from the official repo, as sketched below.
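For example, a minimal sketch of the change (the `dino` value is illustrative; all other model settings stay as in the shipped configs):
```python
# Sketch: switch the backbone by editing the `pretrained` field of the config.
# The shipped Swin configs point to a local checkpoint instead, e.g.
# pretrained='pretrained/swinv2_small_1k_500k.pth'.
model = dict(
    type='PoseAnythingModel',
    pretrained='dino',  # one of: 'dinov2', 'dino', 'resnet'
    # ... keep the remaining encoder/keypoint-head settings unchanged
)
```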
### Training
To train the model, run:
```
python train.py --config [path_to_config_file] --work-dir [path_to_work_dir]
```
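For multi-GPU or cluster training, the repo also includes `tools/dist_train.sh` and `tools/slurm_train.sh`. Assuming the standard MMPose calling convention (an assumption, not verified against the scripts here), distributed training would look like:
```
bash tools/dist_train.sh [path_to_config_file] [num_gpus]
```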
## Evaluation and Pretrained Models
You can download the pretrained checkpoints from the following [link](https://drive.google.com/drive/folders/1RmrqzE3g0qYRD5xn54-aXEzrIkdYXpEW?usp=sharing).
Here we provide the evaluation results of our pretrained models on the MP-100 dataset, along with the config files and checkpoints:
### 1-Shot Models
| Setting | split 1 | split 2 | split 3 | split 4 | split 5 |
|:-------:|:-------:|:-------:|:-------:|:-------:|:-------:|
| Tiny | 91.19 | 87.81 | 85.68 | 85.87 | 85.61 |
| | [link](https://drive.google.com/file/d/1GubmkVkqybs-eD4hiRkgBzkUVGE_rIFX/view?usp=drive_link) / [config](configs/1shots/graph_split1_config.py) | [link](https://drive.google.com/file/d/1EEekDF3xV_wJOVk7sCQWUA8ygUKzEm2l/view?usp=drive_link) / [config](configs/1shots/graph_split2_config.py) | [link](https://drive.google.com/file/d/1FuwpNBdPI3mfSovta2fDGKoqJynEXPZQ/view?usp=drive_link) / [config](configs/1shots/graph_split3_config.py) | [link](https://drive.google.com/file/d/1_SSqSANuZlbC0utzIfzvZihAW9clefcR/view?usp=drive_link) / [config](configs/1shots/graph_split4_config.py) | [link](https://drive.google.com/file/d/1nUHr07W5F55u-FKQEPFq_CECgWZOKKLF/view?usp=drive_link) / [config](configs/1shots/graph_split5_config.py) |
| Small | 94.73 | 89.79 | 90.69 | 88.09 | 90.11 |
| | [link](https://drive.google.com/file/d/1RT1Q8AMEa1kj6k9ZqrtWIKyuR4Jn4Pqc/view?usp=drive_link) / [config](configs/1shot-swin/graph_split1_config.py) | [link](https://drive.google.com/file/d/1BT5b8MlnkflcdhTFiBROIQR3HccLsPQd/view?usp=drive_link) / [config](configs/1shot-swin/graph_split2_config.py) | [link](https://drive.google.com/file/d/1Z64cw_1CSDGObabSAWKnMK0BA_bqDHxn/view?usp=drive_link) / [config](configs/1shot-swin/graph_split3_config.py) | [link](https://drive.google.com/file/d/1vf82S8LAjIzpuBcbEoDCa26cR8DqNriy/view?usp=drive_link) / [config](configs/1shot-swin/graph_split4_config.py) | [link](https://drive.google.com/file/d/14FNx0JNbkS2CvXQMiuMU_kMZKFGO2rDV/view?usp=drive_link) / [config](configs/1shot-swin/graph_split5_config.py) |
### 5-Shot Models
| Setting | split 1 | split 2 | split 3 | split 4 | split 5 |
|:-------:|:-------:|:-------:|:-------:|:-------:|:-------:|
| Tiny | 94.24 | 91.32 | 90.15 | 90.37 | 89.73 |
| | [link](https://drive.google.com/file/d/1PeMuwv5YwiF3UCE5oN01Qchu5K3BaQ9L/view?usp=drive_link) / [config](configs/5shots/graph_split1_config.py) | [link](https://drive.google.com/file/d/1enIapPU1D8lZOET7q_qEjnhC1HFy3jWK/view?usp=drive_link) / [config](configs/5shots/graph_split2_config.py) | [link](https://drive.google.com/file/d/1MTeZ9Ba-ucLuqX0KBoLbBD5PaEct7VUp/view?usp=drive_link) / [config](configs/5shots/graph_split3_config.py) | [link](https://drive.google.com/file/d/1U2N7DI2F0v7NTnPCEEAgx-WKeBZNAFoa/view?usp=drive_link) / [config](configs/5shots/graph_split4_config.py) | [link](https://drive.google.com/file/d/1wapJDgtBWtmz61JNY7ktsFyvckRKiR2C/view?usp=drive_link) / [config](configs/5shots/graph_split5_config.py) |
| Small | 96.67 | 91.48 | 92.62 | 90.95 | 92.41 |
| | [link](https://drive.google.com/file/d/1p5rnA0MhmndSKEbyXMk49QXvNE03QV2p/view?usp=drive_link) / [config](configs/5shot-swin/graph_split1_config.py) | [link](https://drive.google.com/file/d/1Q3KNyUW_Gp3JytYxUPhkvXFiDYF6Hv8w/view?usp=drive_link) / [config](configs/5shot-swin/graph_split2_config.py) | [link](https://drive.google.com/file/d/1gWgTk720fSdAf_ze1FkfXTW0t7k-69dV/view?usp=drive_link) / [config](configs/5shot-swin/graph_split3_config.py) | [link](https://drive.google.com/file/d/1LuaRQ8a6AUPrkr7l5j2W6Fe_QbgASkwY/view?usp=drive_link) / [config](configs/5shot-swin/graph_split4_config.py) | [link](https://drive.google.com/file/d/1z--MAOPCwMG_GQXru9h2EStbnIvtHv1L/view?usp=drive_link) / [config](configs/5shot-swin/graph_split5_config.py) |
### Evaluation
Evaluation on a single GPU takes approximately 30 minutes.
To evaluate the pretrained model, run:
```
python test.py [path_to_config_file] [path_to_pretrained_ckpt]
```
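Distributed evaluation scripts (`tools/dist_test.sh`, `tools/slurm_test.sh`) are also included. Assuming the standard MMPose calling convention (again an assumption), multi-GPU evaluation would look like:
```
bash tools/dist_test.sh [path_to_config_file] [path_to_pretrained_ckpt] [num_gpus]
```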
## Acknowledgement
Our code is based on code from:
- [MMPose](https://github.com/open-mmlab/mmpose)
- [CapeFormer](https://github.com/flyinglynx/CapeFormer)
## License
This project is released under the Apache 2.0 license.
================================================
FILE: app.py
================================================
import argparse
import random
import gradio as gr
import matplotlib
import numpy as np
import torch
from PIL import ImageDraw, Image
from matplotlib import pyplot as plt
from mmcv import Config
from mmcv.runner import load_checkpoint
from mmpose.core import wrap_fp16_model
from mmpose.models import build_posenet
from torchvision import transforms
from demo import Resize_Pad
from models import *
# Copyright (c) OpenMMLab. All rights reserved.
# os.system('python -m pip install timm')
# os.system('python -m pip install Openmim')
# os.system('python -m mim install mmengine')
# os.system('python -m mim install "mmcv-full==1.6.2"')
# os.system('python -m mim install "mmpose==0.29.0"')
# os.system('python -m mim install "gradio==3.44.0"')
# os.system('python setup.py develop')
matplotlib.use('agg')
checkpoint_path = ''
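# Visualization helper: normalizes the query image, draws the predicted
# keypoints as circles (with index labels) and the user-defined skeleton limbs
# as colored lines, and returns the resulting matplotlib figure.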
def plot_results(support_img, query_img, support_kp, support_w, query_kp,
query_w, skeleton,
initial_proposals, prediction, radius=6):
h, w, c = support_img.shape
prediction = prediction[-1].cpu().numpy() * h
query_img = (query_img - np.min(query_img)) / (
np.max(query_img) - np.min(query_img))
for id, (img, w, keypoint) in enumerate(zip([query_img],
[query_w],
[prediction])):
f, axes = plt.subplots()
plt.imshow(img)
for k in range(keypoint.shape[0]):
if w[k] > 0:
kp = keypoint[k, :2]
c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6)
patch = plt.Circle(kp, radius, color=c)
axes.add_patch(patch)
axes.text(kp[0], kp[1], k)
plt.draw()
for l, limb in enumerate(skeleton):
kp = keypoint[:, :2]
if l > len(COLORS) - 1:
c = [x / 255 for x in random.sample(range(0, 255), 3)]
else:
c = [x / 255 for x in COLORS[l]]
if w[limb[0]] > 0 and w[limb[1]] > 0:
patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]],
[kp[limb[0], 1], kp[limb[1], 1]],
linewidth=6, color=c, alpha=0.6)
axes.add_artist(patch)
plt.axis('off') # command for hiding the axis.
plt.subplots_adjust(0, 0, 1, 1, 0, 0)
return plt
COLORS = [
[255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
[85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
[0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
[255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]
]
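# Gradio callback for the Evaluate button: builds an MMPose-style data dict
# from the user-annotated support keypoints/skeleton and the query image,
# generates support heatmaps, builds the model from the config, loads the
# checkpoint, runs inference, and returns the plotted prediction.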
def process(query_img, state,
cfg_path='configs/demo_b.py'):
cfg = Config.fromfile(cfg_path)
width, height, _ = state['original_support_image'].shape
kp_src_np = np.array(state['kp_src']).copy().astype(np.float32)
kp_src_np[:, 0] = kp_src_np[:, 0] / (
width // 4) * cfg.model.encoder_config.img_size
kp_src_np[:, 1] = kp_src_np[:, 1] / (
height // 4) * cfg.model.encoder_config.img_size
kp_src_np = np.flip(kp_src_np, 1).copy()
kp_src_tensor = torch.tensor(kp_src_np).float()
preprocess = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
Resize_Pad(cfg.model.encoder_config.img_size,
cfg.model.encoder_config.img_size)])
if len(state['skeleton']) == 0:
state['skeleton'] = [(0, 0)]
support_img = preprocess(state['original_support_image']).flip(0)[None]
np_query = np.array(query_img)[:, :, ::-1].copy()
q_img = preprocess(np_query).flip(0)[None]
# Create heatmap from keypoints
genHeatMap = TopDownGenerateTargetFewShot()
data_cfg = cfg.data_cfg
data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size,
cfg.model.encoder_config.img_size])
data_cfg['joint_weights'] = None
data_cfg['use_different_joint_weights'] = False
kp_src_3d = torch.cat(
(kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
kp_src_3d_weight = torch.cat(
(torch.ones_like(kp_src_tensor),
torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg,
kp_src_3d,
kp_src_3d_weight,
sigma=1)
target_s = torch.tensor(target_s).float()[None]
target_weight_s = torch.ones_like(
torch.tensor(target_weight_s).float()[None])
data = {
'img_s': [support_img],
'img_q': q_img,
'target_s': [target_s],
'target_weight_s': [target_weight_s],
'target_q': None,
'target_weight_q': None,
'return_loss': False,
'img_metas': [{'sample_skeleton': [state['skeleton']],
'query_skeleton': state['skeleton'],
'sample_joints_3d': [kp_src_3d],
'query_joints_3d': kp_src_3d,
'sample_center': [kp_src_tensor.mean(dim=0)],
'query_center': kp_src_tensor.mean(dim=0),
'sample_scale': [
kp_src_tensor.max(dim=0)[0] -
kp_src_tensor.min(dim=0)[0]],
'query_scale': kp_src_tensor.max(dim=0)[0] -
kp_src_tensor.min(dim=0)[0],
'sample_rotation': [0],
'query_rotation': 0,
'sample_bbox_score': [1],
'query_bbox_score': 1,
'query_image_file': '',
'sample_image_file': [''],
}]
}
# Load model
model = build_posenet(cfg.model)
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, checkpoint_path, map_location='cpu')
model.eval()
with torch.no_grad():
outputs = model(**data)
# visualize results
vis_s_weight = target_weight_s[0]
vis_q_weight = target_weight_s[0]
vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0)
vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0)
support_kp = kp_src_3d
out = plot_results(vis_s_image,
vis_q_image,
support_kp,
vis_s_weight,
None,
vis_q_weight,
state['skeleton'],
None,
torch.tensor(outputs['points']).squeeze(0),
)
return out, state
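# Gradio callback for the Examples widget: restores a pre-annotated support
# image by re-drawing its stored keypoints and skeleton limbs.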
def update_examples(support_img, posed_support, query_img, state, r=0.015, width=0.02):
state['color_idx'] = 0
state['original_support_image'] = np.array(support_img)[:, :, ::-1].copy()
support_img, posed_support, _ = set_query(support_img, state, example=True)
w, h = support_img.size
draw_pose = ImageDraw.Draw(support_img)
draw_limb = ImageDraw.Draw(posed_support)
r = int(r * w)
width = int(width * w)
for pixel in state['kp_src']:
leftUpPoint = (pixel[1] - r, pixel[0] - r)
rightDownPoint = (pixel[1] + r, pixel[0] + r)
twoPointList = [leftUpPoint, rightDownPoint]
draw_pose.ellipse(twoPointList, fill=(255, 0, 0, 255))
draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255))
for limb in state['skeleton']:
point_a = state['kp_src'][limb[0]][::-1]
point_b = state['kp_src'][limb[1]][::-1]
if state['color_idx'] < len(COLORS):
c = COLORS[state['color_idx']]
state['color_idx'] += 1
else:
c = random.choices(range(256), k=3)
draw_limb.line([point_a, point_b], fill=tuple(c), width=width)
return support_img, posed_support, query_img, state
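# Gradio select handler for the support image: records the clicked pixel as a
# new support keypoint and draws it on both the keypoint and limb canvases.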
def get_select_coords(kp_support,
limb_support,
state,
evt: gr.SelectData,
r=0.015):
pixels_in_queue = set()
pixels_in_queue.add((evt.index[1], evt.index[0]))
while len(pixels_in_queue) > 0:
pixel = pixels_in_queue.pop()
if pixel[0] is not None and pixel[1] is not None and pixel not in \
state['kp_src']:
state['kp_src'].append(pixel)
else:
continue
if limb_support is None:
canvas_limb = kp_support
else:
canvas_limb = limb_support
canvas_kp = kp_support
w, h = canvas_kp.size
draw_pose = ImageDraw.Draw(canvas_kp)
draw_limb = ImageDraw.Draw(canvas_limb)
r = int(r * w)
leftUpPoint = (pixel[1] - r, pixel[0] - r)
rightDownPoint = (pixel[1] + r, pixel[0] + r)
twoPointList = [leftUpPoint, rightDownPoint]
draw_pose.ellipse(twoPointList, fill=(255, 0, 0, 255))
draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255))
return canvas_kp, canvas_limb, state
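# Gradio select handler for the posed support image: snaps each click to the
# nearest keypoint; every second click connects the two selected keypoints
# into a skeleton limb and draws the connecting line.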
def get_limbs(kp_support,
state,
evt: gr.SelectData,
r=0.02, width=0.02):
curr_pixel = (evt.index[1], evt.index[0])
pixels_in_queue = set()
pixels_in_queue.add((evt.index[1], evt.index[0]))
canvas_kp = kp_support
w, h = canvas_kp.size
r = int(r * w)
width = int(width * w)
while len(pixels_in_queue) > 0 and curr_pixel != state['prev_clicked']:
pixel = pixels_in_queue.pop()
state['prev_clicked'] = pixel
closest_point = min(state['kp_src'],
key=lambda p: (p[0] - pixel[0]) ** 2 +
(p[1] - pixel[1]) ** 2)
closest_point_index = state['kp_src'].index(closest_point)
draw_limb = ImageDraw.Draw(canvas_kp)
if state['color_idx'] < len(COLORS):
c = COLORS[state['color_idx']]
else:
c = random.choices(range(256), k=3)
leftUpPoint = (closest_point[1] - r, closest_point[0] - r)
rightDownPoint = (closest_point[1] + r, closest_point[0] + r)
twoPointList = [leftUpPoint, rightDownPoint]
draw_limb.ellipse(twoPointList, fill=tuple(c))
if state['count'] == 0:
state['prev_pt'] = closest_point[1], closest_point[0]
state['prev_pt_idx'] = closest_point_index
state['count'] = state['count'] + 1
else:
if state['prev_pt_idx'] != closest_point_index:
# Create Line and add Limb
draw_limb.line(
[state['prev_pt'], (closest_point[1], closest_point[0])],
fill=tuple(c),
width=width)
state['skeleton'].append(
(state['prev_pt_idx'], closest_point_index))
state['color_idx'] = state['color_idx'] + 1
else:
draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255))
state['count'] = 0
return canvas_kp, state
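# Upload handler: resets the stored keypoints/skeleton (unless loading an
# example), keeps a copy of the original support image, and downsizes the
# displayed support image to a quarter-width square canvas.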
def set_query(support_img, state, example=False):
if not example:
state['skeleton'].clear()
state['kp_src'].clear()
state['original_support_image'] = np.array(support_img)[:, :, ::-1].copy()
width, height = support_img.size
support_img = support_img.resize((width // 4, width // 4),
Image.Resampling.LANCZOS)
return support_img, support_img, state
with gr.Blocks() as demo:
state = gr.State({
'kp_src': [],
'skeleton': [],
'count': 0,
'color_idx': 0,
'prev_pt': None,
'prev_pt_idx': None,
'prev_clicked': None,
'original_support_image': None,
})
gr.Markdown('''
# Pose Anything Demo
We present a novel approach to category agnostic pose estimation that
leverages the inherent geometrical relations between keypoints through a
newly designed Graph Transformer Decoder. By capturing and incorporating
this crucial structural information, our method enhances the accuracy of
keypoint localization, marking a significant departure from conventional
CAPE techniques that treat keypoints as isolated entities.
### [Paper](https://arxiv.org/abs/2311.17891) | [Official Repo](https://github.com/orhir/PoseAnything)
## Instructions
1. Upload an image of the object you want to pose on the **left** image.
2. Click on the **left** image to mark keypoints.
3. Click on the keypoints on the **right** image to mark limbs.
4. Upload an image of the object you want to pose to the query image (
**bottom**).
5. Click **Evaluate** to pose the query image.
''')
with gr.Row():
support_img = gr.Image(label="Support Image",
type="pil",
info='Click to mark keypoints').style(
height=400, width=400)
posed_support = gr.Image(label="Posed Support Image",
type="pil",
interactive=False).style(height=400,
width=400)
with gr.Row():
query_img = gr.Image(label="Query Image",
type="pil").style(height=400, width=400)
with gr.Row():
eval_btn = gr.Button(value="Evaluate")
with gr.Row():
output_img = gr.Plot(label="Output Image", height=400, width=400)
with gr.Row():
gr.Markdown("## Examples")
with gr.Row():
gr.Examples(
examples=[
['examples/dog2.png',
'examples/dog2.png',
'examples/dog1.png',
{'kp_src': [(50, 58), (51, 78), (66, 57), (118, 79),
(154, 79), (217, 74), (218, 103), (156, 104),
(152, 151), (215, 162), (213, 191),
(152, 174), (108, 171)],
'skeleton': [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5),
(3, 7), (7, 6), (3, 12), (12, 8), (8, 9),
(12, 11), (11, 10)], 'count': 0,
'color_idx': 0, 'prev_pt': (174, 152),
'prev_pt_idx': 11, 'prev_clicked': (207, 186),
'original_support_image': None,
}
],
['examples/sofa1.jpg',
'examples/sofa1.jpg',
'examples/sofa2.jpg',
{
'kp_src': [(82, 28), (65, 30), (52, 26), (65, 50),
(84, 52), (53, 54), (43, 52), (45, 71),
(81, 69), (77, 39), (57, 43), (58, 64),
(46, 42), (49, 65)],
'skeleton': [(0, 1), (3, 1), (3, 4), (10, 9), (11, 8),
(1, 10), (10, 11), (11, 3), (1, 2), (7, 6),
(5, 13), (5, 3), (13, 11), (12, 10), (12, 2),
(6, 10), (7, 11)], 'count': 0,
'color_idx': 23, 'prev_pt': (71, 45), 'prev_pt_idx': 7,
'prev_clicked': (56, 63),
'original_support_image': None,
}],
['examples/person1.jpeg',
'examples/person1.jpeg',
'examples/person2.jpeg',
{
'kp_src': [(121, 95), (122, 160), (154, 130), (184, 106),
(181, 153)],
'skeleton': [(0, 1), (1, 2), (0, 2), (2, 3), (2, 4),
(4, 3)], 'count': 0, 'color_idx': 6,
'prev_pt': (153, 181), 'prev_pt_idx': 4,
'prev_clicked': (181, 108),
'original_support_image': None,
}]
],
inputs=[support_img, posed_support, query_img, state],
outputs=[support_img, posed_support, query_img, state],
fn=update_examples,
run_on_click=True,
)
support_img.select(get_select_coords,
[support_img, posed_support, state],
[support_img, posed_support, state])
support_img.upload(set_query,
inputs=[support_img, state],
outputs=[support_img, posed_support, state])
posed_support.select(get_limbs,
[posed_support, state],
[posed_support, state])
eval_btn.click(fn=process,
inputs=[query_img, state],
outputs=[output_img, state])
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Pose Anything Demo')
parser.add_argument('--checkpoint',
help='checkpoint path',
default='https://github.com/orhir/PoseAnything'
'/releases/download/1.0.0/demo_b.pth')
args = parser.parse_args()
checkpoint_path = args.checkpoint
demo.launch()
================================================
FILE: configs/1shot-swin/base_split1_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
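# Usage sketch (comment only, not part of the original config): configs like
# this one are consumed via mmcv/mmpose, mirroring the demo code in app.py:
#     from mmcv import Config
#     from mmcv.runner import load_checkpoint
#     from mmpose.models import build_posenet
#     from models import *  # registers PoseAnythingModel and custom modules
#     cfg = Config.fromfile('configs/1shot-swin/base_split1_config.py')
#     model = build_posenet(cfg.model)
#     load_checkpoint(model, '<path_to_ckpt>', map_location='cpu')
#     model.eval()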
================================================
FILE: configs/1shot-swin/base_split2_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/base_split3_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/base_split4_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/base_split5_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/graph_split1_config.py
================================================
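# 1-shot graph config for MP-100 split 1: same SwinV2-small setup as the
# base_split* configs, plus graph_decoder='pre' in the transformer head.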
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/graph_split2_config.py
================================================
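# 1-shot graph config for MP-100 split 2 (SwinV2-small encoder,
# graph_decoder='pre'); only the annotation split differs from graph_split1.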
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/graph_split3_config.py
================================================
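# 1-shot graph config for MP-100 split 3 (SwinV2-small, graph_decoder='pre').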
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/graph_split4_config.py
================================================
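# 1-shot graph config for MP-100 split 4 (SwinV2-small, graph_decoder='pre').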
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shot-swin/graph_split5_config.py
================================================
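# 1-shot graph config for MP-100 split 5 (SwinV2-small, graph_decoder='pre').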
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/base_split1_config.py
================================================
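# 1-shot baseline config for MP-100 split 1: SwinV2-tiny encoder
# (depths [2, 2, 6, 2], drop_path_rate=0.2) and a larger batch
# (samples_per_gpu=16) than the 1shot-swin configs.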
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/base_split2_config.py
================================================
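# 1-shot baseline config for MP-100 split 2 (SwinV2-tiny encoder).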
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/base_split3_config.py
================================================
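# 1-shot baseline config for MP-100 split 3 (SwinV2-tiny encoder).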
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/base_split4_config.py
================================================
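# 1-shot baseline config for MP-100 split 4 (SwinV2-tiny encoder).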
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/base_split5_config.py
================================================
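# 1-shot baseline config for MP-100 split 5 (SwinV2-tiny encoder).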
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/graph_split1_config.py
================================================
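# 1-shot graph config for MP-100 split 1: SwinV2-tiny encoder with
# graph_decoder='pre' in the transformer head.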
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/graph_split2_config.py
================================================
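# 1-shot graph config for MP-100 split 2 (SwinV2-tiny, graph_decoder='pre').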
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/graph_split3_config.py
================================================
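# 1-shot graph config for MP-100 split 3 (SwinV2-tiny, graph_decoder='pre').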
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/1shots/graph_split4_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
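The dimensions in these configs are coupled: a Swin backbone doubles its channel width at each of its four stages, and the sine positional encoding contributes num_feats channels for each of the x and y axes. A quick sanity check using only numbers from the config above:

embed_dim, depths = 96, [2, 2, 6, 2]         # encoder_config above
swin_out_dim = embed_dim * 2 ** (len(depths) - 1)
assert swin_out_dim == 768                   # matches keypoint_head in_channels

num_feats, d_model = 128, 256                # positional_encoding / transformer
assert 2 * num_feats == d_model              # x and y halves of the embedding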
================================================
FILE: configs/1shots/graph_split5_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=16,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
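The test sets above report PCK at several thresholds. As a rough illustration of the metric (the repo's evaluator may normalize by a different box measure), a keypoint counts as correct when its distance to the ground truth falls below the threshold times a normalizing size:

import numpy as np

def pck(pred, gt, visible, bbox_size, thr=0.2):
    """Fraction of visible keypoints within thr * bbox_size of the ground
    truth (illustrative; the actual evaluator may use a different
    normalizer, e.g. the bbox diagonal)."""
    dist = np.linalg.norm(pred - gt, axis=-1)
    correct = (dist <= thr * bbox_size) & visible.astype(bool)
    return correct.sum() / max(visible.sum(), 1)

pred = np.array([[10.0, 10.0], [50.0, 40.0]])
gt = np.array([[12.0, 11.0], [75.0, 40.0]])
print(pck(pred, gt, np.array([1, 1]), bbox_size=100.0))  # 0.5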
================================================
FILE: configs/5shot-swin/base_split1_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
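The NormalizeTensor mean and std used throughout these pipelines are the standard ImageNet statistics. Applied by hand to a ToTensor-style image in [0, 1], the per-channel operation looks like this (a sketch of the effect, not the repo's transform):

import torch

mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
img = torch.rand(3, 256, 256)    # a ToTensor-style image in [0, 1]
normalized = (img - mean) / std  # per-channel whitening with ImageNet stats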
================================================
FILE: configs/5shot-swin/base_split2_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
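A plain-Python reading of the learning-rate policy above, assuming mmcv's usual step-schedule semantics: linear warmup from lr * warmup_ratio over the first 1000 iterations, then a 0.1x decay at epochs 160 and 180:

def lr_at(epoch, it_in_epoch, iters_per_epoch, base_lr=1e-5,
          warmup_iters=1000, warmup_ratio=0.001, steps=(160, 180)):
    # Linear warmup: ramp from base_lr * warmup_ratio up to base_lr.
    it = epoch * iters_per_epoch + it_in_epoch
    if it < warmup_iters:
        k = it / warmup_iters
        return base_lr * (warmup_ratio * (1 - k) + k)
    # Step decay: multiply by 0.1 at each milestone epoch reached.
    return base_lr * 0.1 ** sum(epoch >= s for s in steps)

print(lr_at(0, 0, 500))    # 1e-08, start of warmup
print(lr_at(10, 0, 500))   # 1e-05, warmup finished
print(lr_at(185, 0, 500))  # ~1e-07, after both decay steps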
================================================
FILE: configs/5shot-swin/base_split3_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shot-swin/base_split4_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
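TopDownGenerateTargetFewShot with sigma=1 produces Gaussian heatmap targets on the 64x64 grid configured above. A minimal sketch of one such target (the repo's transform also handles visibility weights and coordinate scaling):

import numpy as np

def gaussian_heatmap(x, y, size=64, sigma=1.0):
    # Separable 2D Gaussian centered at (x, y) on a size x size grid.
    xs = np.arange(size)
    gx = np.exp(-((xs - x) ** 2) / (2 * sigma ** 2))
    gy = np.exp(-((xs - y) ** 2) / (2 * sigma ** 2))
    return np.outer(gy, gx)  # peak approaches 1.0 at integer centers

hm = gaussian_heatmap(20.0, 31.5)
print(hm.shape, round(float(hm.max()), 3))  # (64, 64) 0.882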
================================================
FILE: configs/5shot-swin/base_split5_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shot-swin/graph_split1_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
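samples_per_gpu is a per-device batch size; the global batch grows with the number of processes the launcher starts. A toy calculation, where num_gpus is a hypothetical launcher argument rather than a config field:

samples_per_gpu = 8      # from the config above
num_gpus = 4             # hypothetical: set by the launch script, not the config
print(samples_per_gpu * num_gpus)  # 32, the effective global batch size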
================================================
FILE: configs/5shot-swin/graph_split2_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shot-swin/graph_split3_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
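Validation and testing are episodic: each episode samples num_shots support images and num_queries query images from a single category, so the evaluation totals follow directly from the config:

val = dict(num_episodes=100, num_queries=15, num_shots=5)
test = dict(num_episodes=200, num_queries=15, num_shots=5)
for name, d in (('val', val), ('test', test)):
    total = d['num_episodes'] * d['num_queries']
    print(name, total)  # val 1500, test 3000 query evaluations per run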
================================================
FILE: configs/5shot-swin/graph_split4_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shot-swin/graph_split5_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_small_1k_500k.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.20, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
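The train pipelines all use TopDownGetRandomScaleRotation with rot_factor=15 and scale_factor=0.15. A sketch of a draw in the spirit of the mmpose transform of that name (clipped normal draws, rotation applied only part of the time; the exact details live in this repo's pipeline code):

import numpy as np

def sample_scale_rotation(scale_factor=0.15, rot_factor=15, rot_prob=0.6):
    # Scale drawn around 1.0 and clipped to [1 - sf, 1 + sf].
    s = np.clip(np.random.randn() * scale_factor + 1,
                1 - scale_factor, 1 + scale_factor)
    # Rotation drawn around 0 deg, clipped to +/- 2 * rot_factor,
    # and applied only with probability rot_prob.
    r = np.clip(np.random.randn() * rot_factor,
                -2 * rot_factor, 2 * rot_factor)
    return s, (r if np.random.rand() <= rot_prob else 0.0)

print(sample_scale_rotation())  # e.g. (1.07, -11.3)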
================================================
FILE: configs/5shots/base_split1_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
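# Category-agnostic setup: every keypoint is predicted through a single shared
# output channel, and max_kpt_num caps how many keypoints a category may
# contribute (categories with fewer keypoints are handled by the dataset
# classes - this reading is an assumption, see models/datasets/datasets/mp100
# for the authoritative logic).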
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
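# A minimal launch sketch for a config like this one. The flag spelling follows
# the usual mmcv/mmpose entry-point convention and is an assumption - check
# train.py and tools/dist_train.sh for the authoritative CLI:
#
#   python train.py configs/5shots/base_split1_config.py --work-dir work_dir/5shot_split1
#   bash tools/dist_train.sh configs/5shots/base_split1_config.py 4  # 4 GPUs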
================================================
FILE: configs/5shots/base_split2_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/base_split3_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/base_split4_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/base_split5_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/graph_split1_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
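            # graph_decoder='pre' is the only setting that distinguishes the
            # graph_* configs from their base_* counterparts; presumably it
            # enables skeleton-graph message passing before the decoder layers
            # (an interpretation, not confirmed here - see
            # models/models/utils/transformer.py).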
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/graph_split2_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/graph_split3_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/graph_split4_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/5shots/graph_split5_config.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='pretrained/swinv2_tiny_patch4_window16_256.pth',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.2,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=5,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: configs/demo_b.py
================================================
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
interval=25,
metric=['PCK', 'NME', 'AUC', 'EPE'],
key_indicator='PCK',
gpu_collect=True,
res_folder='')
optimizer = dict(
type='Adam',
lr=1e-5,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=0.001,
step=[160, 180])
total_epochs = 200
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
channel_cfg = dict(
num_output_channels=1,
dataset_joints=1,
dataset_channel=[
[
0,
],
],
inference_channel=[
0,
],
max_kpt_num=100)
# model settings
model = dict(
type='PoseAnythingModel',
pretrained='swinv2_small',
encoder_config=dict(
type='SwinTransformerV2',
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=16,
drop_path_rate=0.3,
img_size=256,
upsample="bilinear"
),
keypoint_head=dict(
type='PoseHead',
in_channels=768,
transformer=dict(
type='EncoderDecoder',
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder='pre',
dim_feedforward=768,
dropout=0.1,
similarity_proj_dim=256,
dynamic_proj_dim=128,
activation="relu",
normalize_before=False,
return_intermediate_dec=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=True,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
positional_encoding=dict(
type='SinePositionalEncoding', num_feats=128, normalize=True)),
# training and testing settings
train_cfg=dict(),
test_cfg=dict(
flip_test=False,
post_process='default',
shift_heatmap=True,
modulate_kernel=11))
data_cfg = dict(
image_size=[256, 256],
heatmap_size=[64, 64],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'])
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=15,
scale_factor=0.15),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
]),
]
valid_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownAffineFewShot'),
dict(type='ToTensor'),
dict(
type='NormalizeTensor',
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]),
dict(type='TopDownGenerateTargetFewShot', sigma=1),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs', 'category_id',
'skeleton',
]),
]
test_pipeline = valid_pipeline
data_root = 'data/mp100'
data = dict(
samples_per_gpu=8,
workers_per_gpu=8,
train=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
pipeline=train_pipeline),
val=dict(
type='TransformerPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=100,
pipeline=valid_pipeline),
test=dict(
type='TestPoseDataset',
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
img_prefix=f'{data_root}/images/',
# img_prefix=f'{data_root}',
data_cfg=data_cfg,
valid_class_ids=None,
max_kpt_num=channel_cfg['max_kpt_num'],
num_shots=1,
num_queries=15,
num_episodes=200,
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
pipeline=test_pipeline),
)
vis_backends = [
dict(type='LocalVisBackend'),
dict(type='TensorboardVisBackend'),
]
visualizer = dict(
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
shuffle_cfg = dict(interval=1)
================================================
FILE: demo.py
================================================
import argparse
import copy
import os
import pickle
import random
import cv2
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.runner import load_checkpoint
from mmpose.core import wrap_fp16_model
from mmpose.models import build_posenet
from torchvision import transforms
from models import *
import torchvision.transforms.functional as F
from tools.visualization import plot_results
COLORS = [
[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
[85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
[0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
[255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]]
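# Fixed palette for drawing skeleton edges; when more edges are drawn than
# there are palette entries, a random color is sampled instead (see draw_line
# below).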
class Resize_Pad:
def __init__(self, w=256, h=256):
self.w = w
self.h = h
    def __call__(self, image):
        # image is a CHW tensor
        _, h_1, w_1 = image.shape
        ratio_1 = h_1 / w_1
        # pad the shorter side to a square before resizing,
        # so the aspect ratio is preserved
        if round(ratio_1, 2) != 1:
            if ratio_1 > 1:  # taller than wide - pad the width
                wp = (h_1 - w_1) // 2
                image = F.pad(image, (wp, 0, wp, 0), 0, "constant")
            else:  # wider than tall - pad the height
                hp = (w_1 - h_1) // 2
                image = F.pad(image, (0, hp, 0, hp), 0, "constant")
        return F.resize(image, [self.h, self.w])
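# A small usage sketch (the file name is an illustrative placeholder):
# Resize_Pad is meant to be composed with torchvision transforms, e.g.
#   preprocess = transforms.Compose([transforms.ToTensor(), Resize_Pad(256, 256)])
#   img = preprocess(cv2.imread('support.jpg'))  # (3, 256, 256), aspect ratio kept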
def transform_keypoints_to_pad_and_resize(keypoints, image_size):
    trans_keypoints = keypoints.clone()
    h, w = image_size[:2]
    ratio_1 = w / h
    if ratio_1 > 1:
        # width is bigger than height - pad the height
        hp = int(w - h) // 2
        trans_keypoints[:, 1] = keypoints[:, 1] + hp
        trans_keypoints *= (256. / w)
    else:
        # height is bigger than width - pad the width
        wp = int(h - w) // 2
        trans_keypoints[:, 0] = keypoints[:, 0] + wp
        trans_keypoints *= (256. / h)
    return trans_keypoints
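# Worked example (illustrative numbers): for a 512x256 (h x w) image,
# ratio_1 = 0.5, so the width is padded by (512 - 256) // 2 = 128 on each side,
# every keypoint shifts by +128 in x, and everything is scaled by
# 256 / 512 = 0.5 - mirroring what Resize_Pad does to the pixels.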
def parse_args():
parser = argparse.ArgumentParser(description='Pose Anything Demo')
    parser.add_argument('--support', help='support image file')
    parser.add_argument('--query', help='query image file')
    parser.add_argument('--config', default=None, help='test config file path')
    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
    parser.add_argument('--outdir', default='output', help='output directory')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn; this will slightly increase '
             'the inference speed')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
args = parser.parse_args()
return args
def merge_configs(cfg1, cfg2):
# Merge cfg2 into cfg1
# Overwrite cfg1 if repeated, ignore if value is None.
cfg1 = {} if cfg1 is None else cfg1.copy()
cfg2 = {} if cfg2 is None else cfg2
for k, v in cfg2.items():
if v:
cfg1[k] = v
return cfg1
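# Merge semantics in one example: falsy values in cfg2 never overwrite cfg1, so
#   merge_configs({'lr': 1e-5, 'seed': 0}, {'lr': 1e-4, 'seed': None})
# returns {'lr': 1e-4, 'seed': 0}.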
def main():
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.data.test.test_mode = True
os.makedirs(args.outdir, exist_ok=True)
# Load data
support_img = cv2.imread(args.support)
query_img = cv2.imread(args.query)
if support_img is None or query_img is None:
        raise ValueError('Failed to read the support or query image')
preprocess = transforms.Compose([
transforms.ToTensor(),
Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)])
    padded_support_img = preprocess(support_img).cpu().numpy().transpose(1, 2, 0) * 255
    frame = padded_support_img.astype(np.uint8).copy()
kp_src = []
skeleton = []
count = 0
prev_pt = None
prev_pt_idx = None
color_idx = 0
    def selectKP(event, x, y, flags, param):
        nonlocal kp_src, frame
        # on left-click, record the (x, y) location of the click in the
        # keypoint list and draw a small circle there
        if event == cv2.EVENT_LBUTTONDOWN:
            kp_src.append((x, y))
            cv2.circle(frame, (x, y), 2, (0, 0, 255), 1)
            cv2.imshow("Source", frame)
        # on right-click, discard all points and restart from the padded
        # support image that is actually being displayed
        if event == cv2.EVENT_RBUTTONDOWN:
            kp_src = []
            frame = padded_support_img.astype(np.uint8).copy()
            cv2.imshow("Source", frame)
def draw_line(event, x, y, flags, param):
nonlocal skeleton, kp_src, frame, count, prev_pt, prev_pt_idx, marked_frame, color_idx
if event == cv2.EVENT_LBUTTONDOWN:
closest_point = min(kp_src, key=lambda p: (p[0] - x) ** 2 + (p[1] - y) ** 2)
closest_point_index = kp_src.index(closest_point)
            if color_idx < len(COLORS):
                c = COLORS[color_idx]
            else:
                c = random.choices(range(256), k=3)
            cv2.circle(frame, closest_point, 2, c, 1)
if count == 0:
prev_pt = closest_point
prev_pt_idx = closest_point_index
count = count + 1
cv2.imshow("Source", frame)
else:
cv2.line(frame, prev_pt, closest_point, c, 2)
cv2.imshow("Source", frame)
count = 0
skeleton.append((prev_pt_idx, closest_point_index))
color_idx = color_idx + 1
elif event == cv2.EVENT_RBUTTONDOWN:
frame = copy.deepcopy(marked_frame)
cv2.imshow("Source", frame)
count = 0
color_idx = 0
skeleton = []
prev_pt = None
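    # Interactive annotation flow: the first pass over the "Source" window
    # collects keypoints (left-click adds a point, right-click starts over);
    # after any key is pressed, the second pass connects pairs of clicked
    # points into skeleton edges (left-click two existing points to link them,
    # right-click clears the edges).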
cv2.namedWindow("Source", cv2.WINDOW_NORMAL)
cv2.resizeWindow('Source', 800, 600)
cv2.setMouseCallback("Source", selectKP)
cv2.imshow("Source", frame)
# keep looping until points have been selected
    print('Press any key when you have finished marking the points!')
while True:
if cv2.waitKey(1) > 0:
break
marked_frame = copy.deepcopy(frame)
cv2.setMouseCallback("Source", draw_line)
    print('Press any key when you have finished creating the skeleton!')
while True:
if cv2.waitKey(1) > 0:
break
cv2.destroyAllWindows()
kp_src = torch.tensor(kp_src).float()
preprocess = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)])
if len(skeleton) == 0:
skeleton = [(0, 0)]
support_img = preprocess(support_img).flip(0)[None]
query_img = preprocess(query_img).flip(0)[None]
# Create heatmap from keypoints
genHeatMap = TopDownGenerateTargetFewShot()
data_cfg = cfg.data_cfg
data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size])
data_cfg['joint_weights'] = None
data_cfg['use_different_joint_weights'] = False
kp_src_3d = torch.concatenate((kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1)
kp_src_3d_weight = torch.concatenate((torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1)
target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, kp_src_3d, kp_src_3d_weight, sigma=1)
target_s = torch.tensor(target_s).float()[None]
target_weight_s = torch.tensor(target_weight_s).float()[None]
data = {
'img_s': [support_img],
'img_q': query_img,
'target_s': [target_s],
'target_weight_s': [target_weight_s],
'target_q': None,
'target_weight_q': None,
'return_loss': False,
'img_metas': [{'sample_skeleton': [skeleton],
'query_skeleton': skeleton,
'sample_joints_3d': [kp_src_3d],
'query_joints_3d': kp_src_3d,
'sample_center': [kp_src.mean(dim=0)],
'query_center': kp_src.mean(dim=0),
'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]],
'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0],
'sample_rotation': [0],
'query_rotation': 0,
'sample_bbox_score': [1],
'query_bbox_score': 1,
'query_image_file': '',
'sample_image_file': [''],
}]
}
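    # The dict above is one few-shot episode: img_s / target_s / target_weight_s
    # are lists with one entry per support shot (a single shot here), img_q is
    # the query image, and img_metas carries the user-drawn skeleton plus
    # per-image geometry. Each heatmap target from _msra_generate_target should
    # be (K, 64, 64) for K clicked keypoints, given heatmap_size=[64, 64] in the
    # config (shape inferred from the config, not verified here).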
# Load model
model = build_posenet(cfg.model)
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
model.eval()
with torch.no_grad():
outputs = model(**data)
# visualize results
vis_s_weight = target_weight_s[0]
vis_q_weight = target_weight_s[0]
vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0)
vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0)
support_kp = kp_src_3d
plot_results(vis_s_image,
vis_q_image,
support_kp,
vis_s_weight,
None,
vis_q_weight,
skeleton,
None,
torch.tensor(outputs['points']).squeeze(0),
out_dir=args.outdir)
if __name__ == '__main__':
main()
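# Example invocation (file paths are illustrative; the checkpoint name is a
# placeholder, not shipped with the repo):
#   python demo.py --support support.jpg --query query.jpg \
#       --config configs/demo_b.py --checkpoint ckpt/demo_b.pth --outdir output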
================================================
FILE: docker/Dockerfile
================================================
ARG PYTORCH="2.0.1"
ARG CUDA="11.7"
ARG CUDNN="8"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
ENV TZ=Asia/Kolkata DEBIAN_FRONTEND=noninteractive
# To fix GPG key error when running apt-get update
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install xtcocotools
RUN pip install cython
RUN pip install xtcocotools
# Install MMEngine and MMCV
RUN pip install openmim
RUN mim install mmengine
RUN mim install "mmpose==0.28.1"
RUN mim install "mmcv-full==1.5.3"
RUN pip install -U torchmetrics timm
RUN pip install numpy scipy --upgrade
RUN pip install future tensorboard
WORKDIR PoseAnything
COPY models PoseAnything/models
COPY configs PoseAnything/configs
COPY pretrained PoseAnything/pretrained
COPY requirements.txt PoseAnything/
COPY tools PoseAnything/tools
COPY setup.cfg PoseAnything/
COPY setup.py PoseAnything/
COPY test.py PoseAnything/
COPY train.py PoseAnything/
COPY README.md PoseAnything/
RUN mkdir -p PoseAnything/data/mp100
WORKDIR PoseAnything
RUN conda clean --all
ENV FORCE_CUDA="1"
# Install Pose Anything in development mode
RUN python setup.py develop
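# Example usage (tag and host path are illustrative):
#   docker build -t pose-anything -f docker/Dockerfile .
#   docker run --gpus all -it pose-anything
# Mount your MP-100 data over the data/mp100 directory created above, e.g.
# with -v /path/to/mp100:<container workdir>/data/mp100.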
================================================
FILE: models/VERSION
================================================
0.2.0
================================================
FILE: models/__init__.py
================================================
from .core import * # noqa
from .datasets import * # noqa
from .models import * # noqa
================================================
FILE: models/apis/__init__.py
================================================
from .train import train_model
__all__ = [
'train_model'
]
================================================
FILE: models/apis/train.py
================================================
import os
import torch
from models.core.custom_hooks.shuffle_hooks import ShufflePairedSamplesHook
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook,
build_optimizer)
from mmpose.core import DistEvalHook, EvalHook, Fp16OptimizerHook
from mmpose.datasets import build_dataloader
from mmpose.utils import get_root_logger
def train_model(model,
dataset,
val_dataset,
cfg,
distributed=False,
validate=False,
timestamp=None,
meta=None):
"""Train model entry function.
Args:
model (nn.Module): The model to be trained.
dataset (Dataset): Train dataset.
val_dataset (Dataset): Validation dataset, used by the eval hook
when validate=True.
cfg (dict): The config dict for training.
distributed (bool): Whether to use distributed training.
Default: False.
validate (bool): Whether to do evaluation. Default: False.
timestamp (str | None): Local time for runner. Default: None.
meta (dict | None): Meta dict to record some important information.
Default: None
"""
logger = get_root_logger(cfg.log_level)
# prepare data loaders
dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
dataloader_setting = dict(
samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
# cfg.gpus will be ignored if distributed
num_gpus=len(cfg.gpu_ids),
dist=distributed,
seed=cfg.seed,
pin_memory=False,
)
dataloader_setting = dict(dataloader_setting,
**cfg.data.get('train_dataloader', {}))
data_loaders = [
build_dataloader(ds, **dataloader_setting) for ds in dataset
]
# put model on gpus
if distributed:
find_unused_parameters = cfg.get('find_unused_parameters',
False) # NOTE: True has been modified to False for faster training.
# Sets the `find_unused_parameters` parameter in
# torch.nn.parallel.DistributedDataParallel
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
find_unused_parameters=find_unused_parameters)
else:
model = MMDataParallel(
model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)
# build runner
optimizer = build_optimizer(model, cfg.optimizer)
runner = EpochBasedRunner(
model,
optimizer=optimizer,
work_dir=cfg.work_dir,
logger=logger,
meta=meta)
# an ugly workaround to make .log and .log.json filenames the same
runner.timestamp = timestamp
# fp16 setting
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
optimizer_config = Fp16OptimizerHook(
**cfg.optimizer_config, **fp16_cfg, distributed=distributed)
elif distributed and 'type' not in cfg.optimizer_config:
optimizer_config = OptimizerHook(**cfg.optimizer_config)
else:
optimizer_config = cfg.optimizer_config
# register hooks
runner.register_training_hooks(cfg.lr_config, optimizer_config,
cfg.checkpoint_config, cfg.log_config,
cfg.get('momentum_config', None))
if distributed:
runner.register_hook(DistSamplerSeedHook())
shuffle_cfg = cfg.get('shuffle_cfg', None)
if shuffle_cfg is not None:
for data_loader in data_loaders:
runner.register_hook(ShufflePairedSamplesHook(data_loader, **shuffle_cfg))
# register eval hooks
if validate:
eval_cfg = cfg.get('evaluation', {})
eval_cfg['res_folder'] = os.path.join(cfg.work_dir, eval_cfg['res_folder'])
dataloader_setting = dict(
# samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
samples_per_gpu=1,
workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
# cfg.gpus will be ignored if distributed
num_gpus=len(cfg.gpu_ids),
dist=distributed,
shuffle=False,
pin_memory=False,
)
dataloader_setting = dict(dataloader_setting,
**cfg.data.get('val_dataloader', {}))
val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
eval_hook = DistEvalHook if distributed else EvalHook
runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
if cfg.resume_from:
runner.resume(cfg.resume_from)
elif cfg.load_from:
runner.load_checkpoint(cfg.load_from)
runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
================================================
FILE: models/core/__init__.py
================================================
================================================
FILE: models/core/custom_hooks/shuffle_hooks.py
================================================
from mmcv.runner import Hook
from mmpose.utils import get_root_logger
from torch.utils.data import DataLoader
class ShufflePairedSamplesHook(Hook):
"""Non-Distributed ShufflePairedSamples.
After each training epoch, run FewShotKeypointDataset.random_paired_samples()
"""
def __init__(self,
dataloader,
interval=1):
if not isinstance(dataloader, DataLoader):
raise TypeError(f'dataloader must be a pytorch DataLoader, '
f'but got {type(dataloader)}')
self.dataloader = dataloader
self.interval = interval
self.logger = get_root_logger()
def after_train_epoch(self, runner):
"""Called after every training epoch to evaluate the results."""
if not self.every_n_epochs(runner, self.interval):
return
# self.logger.info("Run random_paired_samples()")
# self.logger.info(f"Before: {self.dataloader.dataset.paired_samples[0]}")
self.dataloader.dataset.random_paired_samples()
# self.logger.info(f"After: {self.dataloader.dataset.paired_samples[0]}")
================================================
FILE: models/datasets/__init__.py
================================================
from .builder import * # noqa
from .datasets import * # noqa
from .pipelines import * # noqa
================================================
FILE: models/datasets/builder.py
================================================
from mmcv.utils import build_from_cfg
from mmpose.datasets.builder import DATASETS
from mmpose.datasets.dataset_wrappers import RepeatDataset
from torch.utils.data.dataset import ConcatDataset
def _concat_cfg(cfg):
replace = ['ann_file', 'img_prefix']
channels = ['num_joints', 'dataset_channel']
concat_cfg = []
for i in range(len(cfg['type'])):
cfg_tmp = cfg.deepcopy()
cfg_tmp['type'] = cfg['type'][i]
for item in replace:
assert item in cfg_tmp
assert len(cfg['type']) == len(cfg[item]), (cfg[item])
cfg_tmp[item] = cfg[item][i]
for item in channels:
assert item in cfg_tmp['data_cfg']
assert len(cfg['type']) == len(cfg['data_cfg'][item])
cfg_tmp['data_cfg'][item] = cfg['data_cfg'][item][i]
concat_cfg.append(cfg_tmp)
return concat_cfg
def _check_valid(cfg):
replace = ['num_joints', 'dataset_channel']
if isinstance(cfg['data_cfg'][replace[0]], (list, tuple)):
for item in replace:
cfg['data_cfg'][item] = cfg['data_cfg'][item][0]
return cfg
def build_dataset(cfg, default_args=None):
"""Build a dataset from config dict.
Args:
cfg (dict): Config dict. It should at least contain the key "type".
default_args (dict, optional): Default initialization arguments.
Default: None.
Returns:
Dataset: The constructed dataset.
"""
if isinstance(cfg['type'], (list, tuple)): # In training, type=TransformerPoseDataset
dataset = ConcatDataset(
[build_dataset(c, default_args) for c in _concat_cfg(cfg)])
elif cfg['type'] == 'RepeatDataset':
dataset = RepeatDataset(
build_dataset(cfg['dataset'], default_args), cfg['times'])
else:
cfg = _check_valid(cfg)
dataset = build_from_cfg(cfg, DATASETS, default_args)
return dataset
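# Illustrative config fragment: when 'type' is a list, each position is split
# into its own dataset config and the results are concatenated, e.g.
#   dict(type=['TransformerPoseDataset', 'TransformerPoseDataset'],
#        ann_file=[ann1, ann2], img_prefix=[pfx1, pfx2],
#        data_cfg=dict(num_joints=[n1, n2], dataset_channel=[c1, c2], ...))
# builds ConcatDataset([dataset1, dataset2]).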
================================================
FILE: models/datasets/datasets/__init__.py
================================================
from .mp100 import (FewShotKeypointDataset, FewShotBaseDataset,
TransformerBaseDataset, TransformerPoseDataset)
__all__ = ['FewShotBaseDataset', 'FewShotKeypointDataset',
'TransformerBaseDataset', 'TransformerPoseDataset']
================================================
FILE: models/datasets/datasets/mp100/__init__.py
================================================
from .fewshot_base_dataset import FewShotBaseDataset
from .fewshot_dataset import FewShotKeypointDataset
from .test_base_dataset import TestBaseDataset
from .test_dataset import TestPoseDataset
from .transformer_base_dataset import TransformerBaseDataset
from .transformer_dataset import TransformerPoseDataset
__all__ = [
'FewShotKeypointDataset', 'FewShotBaseDataset',
'TransformerPoseDataset', 'TransformerBaseDataset',
'TestBaseDataset', 'TestPoseDataset'
]
================================================
FILE: models/datasets/datasets/mp100/fewshot_base_dataset.py
================================================
import copy
from abc import ABCMeta, abstractmethod
import json_tricks as json
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmpose.core.evaluation.top_down_eval import (keypoint_pck_accuracy)
from mmpose.datasets import DATASETS
from mmpose.datasets.pipelines import Compose
from torch.utils.data import Dataset
@DATASETS.register_module()
class FewShotBaseDataset(Dataset, metaclass=ABCMeta):
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
test_mode=False):
self.image_info = {}
self.ann_info = {}
self.annotations_path = ann_file
if not img_prefix.endswith('/'):
img_prefix = img_prefix + '/'
self.img_prefix = img_prefix
self.pipeline = pipeline
self.test_mode = test_mode
self.ann_info['image_size'] = np.array(data_cfg['image_size'])
self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size'])
self.ann_info['num_joints'] = data_cfg['num_joints']
self.ann_info['flip_pairs'] = None
self.ann_info['inference_channel'] = data_cfg['inference_channel']
self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
self.ann_info['dataset_channel'] = data_cfg['dataset_channel']
self.db = []
self.num_shots = 1
self.paired_samples = []
self.pipeline = Compose(self.pipeline)
@abstractmethod
def _get_db(self):
"""Load dataset."""
raise NotImplementedError
@abstractmethod
def _select_kpt(self, obj, kpt_id):
"""Select kpt."""
raise NotImplementedError
@abstractmethod
def evaluate(self, cfg, preds, output_dir, *args, **kwargs):
"""Evaluate keypoint results."""
raise NotImplementedError
@staticmethod
def _write_keypoint_results(keypoints, res_file):
"""Write results into a json file."""
with open(res_file, 'w') as f:
json.dump(keypoints, f, sort_keys=True, indent=4)
def _report_metric(self,
res_file,
metrics,
pck_thr=0.2,
pckh_thr=0.7,
auc_nor=30):
"""Keypoint evaluation.
Args:
res_file (str): Json file stored prediction results.
metrics (str | list[str]): Metric to be performed.
Options: 'PCK', 'PCKh', 'AUC', 'EPE'.
pck_thr (float): PCK threshold, default as 0.2.
pckh_thr (float): PCKh threshold, default as 0.7.
auc_nor (float): AUC normalization factor, default as 30 pixel.
Returns:
List: Evaluation results for evaluation metric.
"""
info_str = []
with open(res_file, 'r') as fin:
preds = json.load(fin)
assert len(preds) == len(self.paired_samples)
outputs = []
gts = []
masks = []
threshold_bbox = []
threshold_head_box = []
for pred, pair in zip(preds, self.paired_samples):
item = self.db[pair[-1]]
outputs.append(np.array(pred['keypoints'])[:, :-1])
gts.append(np.array(item['joints_3d'])[:, :-1])
mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0)
mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0)
for id_s in pair[:-1]:
mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0))
masks.append(np.bitwise_and(mask_query, mask_sample))
if 'PCK' in metrics:
bbox = np.array(item['bbox'])
bbox_thr = np.max(bbox[2:])
threshold_bbox.append(np.array([bbox_thr, bbox_thr]))
if 'PCKh' in metrics:
head_box_thr = item['head_size']
threshold_head_box.append(
np.array([head_box_thr, head_box_thr]))
if 'PCK' in metrics:
pck_avg = []
for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox):
_, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt, 0),
np.expand_dims(mask, 0), pck_thr, np.expand_dims(thr_bbox, 0))
pck_avg.append(pck)
info_str.append(('PCK', np.mean(pck_avg)))
return info_str
def _merge_obj(self, Xs_list, Xq, idx):
""" merge Xs_list and Xq.
:param Xs_list: N-shot samples X
:param Xq: query X
:param idx: id of paired_samples
:return: Xall
"""
Xall = dict()
Xall['img_s'] = [Xs['img'] for Xs in Xs_list]
Xall['target_s'] = [Xs['target'] for Xs in Xs_list]
Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list]
xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list]
Xall['img_q'] = Xq['img']
Xall['target_q'] = Xq['target']
Xall['target_weight_q'] = Xq['target_weight']
xq_img_metas = Xq['img_metas'].data
img_metas = dict()
for key in xq_img_metas.keys():
img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas]
img_metas['query_' + key] = xq_img_metas[key]
img_metas['bbox_id'] = idx
Xall['img_metas'] = DC(img_metas, cpu_only=True)
return Xall
def __len__(self):
"""Get the size of the dataset."""
return len(self.paired_samples)
def __getitem__(self, idx):
"""Get the sample given index."""
pair_ids = self.paired_samples[idx]
assert len(pair_ids) == self.num_shots + 1
sample_id_list = pair_ids[:self.num_shots]
query_id = pair_ids[-1]
sample_obj_list = []
for sample_id in sample_id_list:
sample_obj = copy.deepcopy(self.db[sample_id])
sample_obj['ann_info'] = copy.deepcopy(self.ann_info)
sample_obj_list.append(sample_obj)
query_obj = copy.deepcopy(self.db[query_id])
query_obj['ann_info'] = copy.deepcopy(self.ann_info)
if not self.test_mode:
# randomly select "one" keypoint
sample_valid = (sample_obj_list[0]['joints_3d_visible'][:, 0] > 0)
for sample_obj in sample_obj_list:
sample_valid = sample_valid & (sample_obj['joints_3d_visible'][:, 0] > 0)
query_valid = (query_obj['joints_3d_visible'][:, 0] > 0)
valid_s = np.where(sample_valid)[0]
valid_q = np.where(query_valid)[0]
valid_sq = np.where(sample_valid & query_valid)[0]
if len(valid_sq) > 0:
kpt_id = np.random.choice(valid_sq)
elif len(valid_s) > 0:
kpt_id = np.random.choice(valid_s)
elif len(valid_q) > 0:
kpt_id = np.random.choice(valid_q)
else:
kpt_id = np.random.choice(np.array(range(len(query_valid))))
for i in range(self.num_shots):
sample_obj_list[i] = self._select_kpt(sample_obj_list[i], kpt_id)
query_obj = self._select_kpt(query_obj, kpt_id)
# when test, all keypoints will be preserved.
Xs_list = []
for sample_obj in sample_obj_list:
Xs = self.pipeline(sample_obj)
Xs_list.append(Xs)
Xq = self.pipeline(query_obj)
Xall = self._merge_obj(Xs_list, Xq, idx)
Xall['skeleton'] = self.db[query_id]['skeleton']
return Xall
def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
"""sort kpts and remove the repeated ones."""
kpts = sorted(kpts, key=lambda x: x[key])
num = len(kpts)
for i in range(num - 1, 0, -1):
if kpts[i][key] == kpts[i - 1][key]:
del kpts[i]
return kpts
================================================
FILE: models/datasets/datasets/mp100/fewshot_dataset.py
================================================
import os
import random
from collections import OrderedDict
import numpy as np
from mmpose.datasets import DATASETS
from xtcocotools.coco import COCO
from .fewshot_base_dataset import FewShotBaseDataset
@DATASETS.register_module()
class FewShotKeypointDataset(FewShotBaseDataset):
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
valid_class_ids,
num_shots=1,
num_queries=100,
num_episodes=1,
test_mode=False):
super().__init__(
ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode)
self.ann_info['flip_pairs'] = []
self.ann_info['upper_body_ids'] = []
self.ann_info['lower_body_ids'] = []
self.ann_info['use_different_joint_weights'] = False
self.ann_info['joint_weights'] = np.array([1., ],
dtype=np.float32).reshape((self.ann_info['num_joints'], 1))
self.coco = COCO(ann_file)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.img_ids = self.coco.getImgIds()
self.classes = [
cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds())
]
self.num_classes = len(self.classes)
self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds()))
self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes))
if valid_class_ids is not None:
self.valid_class_ids = valid_class_ids
else:
self.valid_class_ids = self.coco.getCatIds()
self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids]
self.cats = self.coco.cats
# Also update self.cat2obj
self.db = self._get_db()
self.num_shots = num_shots
if not test_mode:
# Update every training epoch
self.random_paired_samples()
else:
self.num_queries = num_queries
self.num_episodes = num_episodes
self.make_paired_samples()
def random_paired_samples(self):
num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes]
# balance the dataset
max_num_data = max(num_datas)
all_samples = []
for cls in self.valid_class_ids:
for i in range(max_num_data):
shot = random.sample(self.cat2obj[cls], self.num_shots + 1)
all_samples.append(shot)
self.paired_samples = np.array(all_samples)
np.random.shuffle(self.paired_samples)
def make_paired_samples(self):
random.seed(1)
np.random.seed(0)
all_samples = []
for cls in self.valid_class_ids:
for _ in range(self.num_episodes):
shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries)
sample_ids = shots[:self.num_shots]
query_ids = shots[self.num_shots:]
for query_id in query_ids:
all_samples.append(sample_ids + [query_id])
self.paired_samples = np.array(all_samples)
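# Each class thus contributes num_episodes * num_queries (support, query)
# pairs; the fixed seeds above make the test-time pairing reproducible.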
def _select_kpt(self, obj, kpt_id):
obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id + 1]
obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id + 1]
obj['kpt_id'] = kpt_id
return obj
@staticmethod
def _get_mapping_id_name(imgs):
"""
Args:
imgs (dict): dict of image info.
Returns:
tuple: Image name & id mapping dicts.
- id2name (dict): Mapping image id to name.
- name2id (dict): Mapping image name to id.
"""
id2name = {}
name2id = {}
for image_id, image in imgs.items():
file_name = image['file_name']
id2name[image_id] = file_name
name2id[file_name] = image_id
return id2name, name2id
def _get_db(self):
"""Ground truth bbox and keypoints."""
self.obj_id = 0
self.cat2obj = {}
for i in self.coco.getCatIds():
self.cat2obj.update({i: []})
gt_db = []
for img_id in self.img_ids:
gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id))
return gt_db
def _load_coco_keypoint_annotation_kernel(self, img_id):
"""load annotation from COCOAPI.
Note:
bbox:[x1, y1, w, h]
Args:
img_id: coco image id
Returns:
dict: db entry
"""
img_ann = self.coco.loadImgs(img_id)[0]
width = img_ann['width']
height = img_ann['height']
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = self.coco.loadAnns(ann_ids)
# sanitize bboxes
valid_objs = []
for obj in objs:
if 'bbox' not in obj:
continue
x, y, w, h = obj['bbox']
x1 = max(0, x)
y1 = max(0, y)
x2 = min(width - 1, x1 + max(0, w - 1))
y2 = min(height - 1, y1 + max(0, h - 1))
if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
bbox_id = 0
rec = []
for obj in objs:
if 'keypoints' not in obj:
continue
if max(obj['keypoints']) == 0:
continue
if 'num_keypoints' in obj and obj['num_keypoints'] == 0:
continue
category_id = obj['category_id']
# the number of keypoint for this specific category
cat_kpt_num = int(len(obj['keypoints']) / 3)
joints_3d = np.zeros((cat_kpt_num, 3), dtype=np.float32)
joints_3d_visible = np.zeros((cat_kpt_num, 3), dtype=np.float32)
keypoints = np.array(obj['keypoints']).reshape(-1, 3)
joints_3d[:, :2] = keypoints[:, :2]
joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3])
center, scale = self._xywh2cs(*obj['clean_bbox'][:4])
image_file = os.path.join(self.img_prefix, self.id2name[img_id])
self.cat2obj[category_id].append(self.obj_id)
rec.append({
'image_file': image_file,
'center': center,
'scale': scale,
'rotation': 0,
'bbox': obj['clean_bbox'][:4],
'bbox_score': 1,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'category_id': category_id,
'cat_kpt_num': cat_kpt_num,
'bbox_id': self.obj_id,
'skeleton': self.coco.cats[obj['category_id']]['skeleton'],
})
bbox_id = bbox_id + 1
self.obj_id += 1
return rec
def _xywh2cs(self, x, y, w, h):
"""This encodes bbox(x,y,w,w) into (center, scale)
Args:
x, y, w, h
Returns:
tuple: A tuple containing center and scale.
- center (np.ndarray[float32](2,)): center of the bbox (x, y).
- scale (np.ndarray[float32](2,)): scale of the bbox w & h.
"""
aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1]
center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
#
# if (not self.test_mode) and np.random.rand() < 0.3:
# center += 0.4 * (np.random.rand(2) - 0.5) * [w, h]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
# pixel std is 200.0
scale = np.array([w / 200.0, h / 200.0], dtype=np.float32)
# padding to include proper amount of context
scale = scale * 1.25
return center, scale
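# Worked example with a square input (aspect_ratio = 1.0): bbox
# x=10, y=20, w=100, h=50 gives center=(60, 45); h is stretched to 100 to
# match the aspect ratio, so scale = (100/200, 100/200) * 1.25 = (0.625, 0.625).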
def evaluate(self, outputs, res_folder, metric='PCK', **kwargs):
"""Evaluate interhand2d keypoint results. The pose prediction results
will be saved in `${res_folder}/result_keypoints.json`.
Note:
batch_size: N
num_keypoints: K
heatmap height: H
heatmap width: W
Args:
outputs (list(preds, boxes, image_path, output_heatmap))
:preds (np.ndarray[N,K,3]): The first two dimensions are
coordinates, score is the third dimension of the array.
:boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]
, scale[1],area, score]
:image_paths (list[str]): For example,
['Capture12/0390_dh_touchROM/cam410209/image62434.jpg'].
:output_heatmap (np.ndarray[N, K, H, W]): model outputs.
res_folder (str): Path of directory to save the results.
metric (str | list[str]): Metric to be performed.
Options: 'PCK', 'AUC', 'EPE'.
Returns:
dict: Evaluation results for evaluation metric.
"""
metrics = metric if isinstance(metric, list) else [metric]
allowed_metrics = ['PCK', 'AUC', 'EPE']
for metric in metrics:
if metric not in allowed_metrics:
raise KeyError(f'metric {metric} is not supported')
res_file = os.path.join(res_folder, 'result_keypoints.json')
kpts = []
for output in outputs:
preds = output['preds']
boxes = output['boxes']
image_paths = output['image_paths']
bbox_ids = output['bbox_ids']
batch_size = len(image_paths)
for i in range(batch_size):
image_id = self.name2id[image_paths[i][len(self.img_prefix):]]
kpts.append({
'keypoints': preds[i].tolist(),
'center': boxes[i][0:2].tolist(),
'scale': boxes[i][2:4].tolist(),
'area': float(boxes[i][4]),
'score': float(boxes[i][5]),
'image_id': image_id,
'bbox_id': bbox_ids[i]
})
kpts = self._sort_and_unique_bboxes(kpts)
self._write_keypoint_results(kpts, res_file)
info_str = self._report_metric(res_file, metrics)
name_value = OrderedDict(info_str)
return name_value
================================================
FILE: models/datasets/datasets/mp100/test_base_dataset.py
================================================
import copy
from abc import ABCMeta, abstractmethod
import json_tricks as json
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, keypoint_nme,
keypoint_pck_accuracy)
from mmpose.datasets import DATASETS
from mmpose.datasets.pipelines import Compose
from torch.utils.data import Dataset
@DATASETS.register_module()
class TestBaseDataset(Dataset, metaclass=ABCMeta):
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
test_mode=True,
PCK_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25]):
self.image_info = {}
self.ann_info = {}
self.annotations_path = ann_file
if not img_prefix.endswith('/'):
img_prefix = img_prefix + '/'
self.img_prefix = img_prefix
self.pipeline = pipeline
self.test_mode = test_mode
self.PCK_threshold_list = PCK_threshold_list
self.ann_info['image_size'] = np.array(data_cfg['image_size'])
self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size'])
self.ann_info['num_joints'] = data_cfg['num_joints']
self.ann_info['flip_pairs'] = None
self.ann_info['inference_channel'] = data_cfg['inference_channel']
self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
self.ann_info['dataset_channel'] = data_cfg['dataset_channel']
self.db = []
self.num_shots = 1
self.paired_samples = []
self.pipeline = Compose(self.pipeline)
@abstractmethod
def _get_db(self):
"""Load dataset."""
raise NotImplementedError
@abstractmethod
def _select_kpt(self, obj, kpt_id):
"""Select kpt."""
raise NotImplementedError
@abstractmethod
def evaluate(self, cfg, preds, output_dir, *args, **kwargs):
"""Evaluate keypoint results."""
raise NotImplementedError
@staticmethod
def _write_keypoint_results(keypoints, res_file):
"""Write results into a json file."""
with open(res_file, 'w') as f:
json.dump(keypoints, f, sort_keys=True, indent=4)
def _report_metric(self,
res_file,
metrics):
"""Keypoint evaluation.
Args:
res_file (str): Json file stored prediction results.
metrics (str | list[str]): Metric to be performed.
Options: 'PCK', 'NME', 'AUC', 'EPE'.
PCK is computed at every threshold in self.PCK_threshold_list and
reported together with their mean (mPCK).
Returns:
List: Evaluation results for evaluation metric.
"""
info_str = []
with open(res_file, 'r') as fin:
preds = json.load(fin)
assert len(preds) == len(self.paired_samples)
outputs = []
gts = []
masks = []
threshold_bbox = []
threshold_head_box = []
for pred, pair in zip(preds, self.paired_samples):
item = self.db[pair[-1]]
outputs.append(np.array(pred['keypoints'])[:, :-1])
gts.append(np.array(item['joints_3d'])[:, :-1])
mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0)
mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0)
for id_s in pair[:-1]:
mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0))
masks.append(np.bitwise_and(mask_query, mask_sample))
if 'PCK' in metrics or 'NME' in metrics or 'AUC' in metrics:
bbox = np.array(item['bbox'])
bbox_thr = np.max(bbox[2:])
threshold_bbox.append(np.array([bbox_thr, bbox_thr]))
if 'PCKh' in metrics:
head_box_thr = item['head_size']
threshold_head_box.append(
np.array([head_box_thr, head_box_thr]))
if 'PCK' in metrics:
pck_results = dict()
for pck_thr in self.PCK_threshold_list:
pck_results[pck_thr] = []
for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox):
for pck_thr in self.PCK_threshold_list:
_, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt, 0),
np.expand_dims(mask, 0), pck_thr, np.expand_dims(thr_bbox, 0))
pck_results[pck_thr].append(pck)
mPCK = 0
for pck_thr in self.PCK_threshold_list:
info_str.append(['PCK@' + str(pck_thr), np.mean(pck_results[pck_thr])])
mPCK += np.mean(pck_results[pck_thr])
info_str.append(['mPCK', mPCK / len(self.PCK_threshold_list)])
if 'NME' in metrics:
nme_results = []
for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox):
nme = keypoint_nme(np.expand_dims(output, 0), np.expand_dims(gt, 0), np.expand_dims(mask, 0),
np.expand_dims(thr_bbox, 0))
nme_results.append(nme)
info_str.append(['NME', np.mean(nme_results)])
if 'AUC' in metrics:
auc_results = []
for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox):
auc = keypoint_auc(np.expand_dims(output, 0), np.expand_dims(gt, 0), np.expand_dims(mask, 0),
thr_bbox[0])
auc_results.append(auc)
info_str.append(['AUC', np.mean(auc_results)])
if 'EPE' in metrics:
epe_results = []
for (output, gt, mask) in zip(outputs, gts, masks):
epe = keypoint_epe(np.expand_dims(output, 0), np.expand_dims(gt, 0), np.expand_dims(mask, 0))
epe_results.append(epe)
info_str.append(['EPE', np.mean(epe_results)])
return info_str
def _merge_obj(self, Xs_list, Xq, idx):
""" merge Xs_list and Xq.
:param Xs_list: N-shot samples X
:param Xq: query X
:param idx: id of paired_samples
:return: Xall
"""
Xall = dict()
Xall['img_s'] = [Xs['img'] for Xs in Xs_list]
Xall['target_s'] = [Xs['target'] for Xs in Xs_list]
Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list]
xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list]
Xall['img_q'] = Xq['img']
Xall['target_q'] = Xq['target']
Xall['target_weight_q'] = Xq['target_weight']
xq_img_metas = Xq['img_metas'].data
img_metas = dict()
for key in xq_img_metas.keys():
img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas]
img_metas['query_' + key] = xq_img_metas[key]
img_metas['bbox_id'] = idx
Xall['img_metas'] = DC(img_metas, cpu_only=True)
return Xall
def __len__(self):
"""Get the size of the dataset."""
return len(self.paired_samples)
def __getitem__(self, idx):
"""Get the sample given index."""
pair_ids = self.paired_samples[idx] # [num_shots support ids, query id]
assert len(pair_ids) == self.num_shots + 1
sample_id_list = pair_ids[:self.num_shots]
query_id = pair_ids[-1]
sample_obj_list = []
for sample_id in sample_id_list:
sample_obj = copy.deepcopy(self.db[sample_id])
sample_obj['ann_info'] = copy.deepcopy(self.ann_info)
sample_obj_list.append(sample_obj)
query_obj = copy.deepcopy(self.db[query_id])
query_obj['ann_info'] = copy.deepcopy(self.ann_info)
Xs_list = []
for sample_obj in sample_obj_list:
Xs = self.pipeline(sample_obj) # dict with ['img', 'target', 'target_weight', 'img_metas'],
Xs_list.append(Xs) # Xs['target'] is of shape [100, map_h, map_w]
Xq = self.pipeline(query_obj)
Xall = self._merge_obj(Xs_list, Xq, idx)
Xall['skeleton'] = self.db[query_id]['skeleton']
return Xall
def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
"""sort kpts and remove the repeated ones."""
kpts = sorted(kpts, key=lambda x: x[key])
num = len(kpts)
for i in range(num - 1, 0, -1):
if kpts[i][key] == kpts[i - 1][key]:
del kpts[i]
return kpts
================================================
FILE: models/datasets/datasets/mp100/test_dataset.py
================================================
import os
import random
from collections import OrderedDict
import numpy as np
from mmpose.datasets import DATASETS
from xtcocotools.coco import COCO
from .test_base_dataset import TestBaseDataset
@DATASETS.register_module()
class TestPoseDataset(TestBaseDataset):
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
valid_class_ids,
max_kpt_num=None,
num_shots=1,
num_queries=100,
num_episodes=1,
pck_threshold_list=[0.05, 0.1, 0.15, 0.20, 0.25],
test_mode=True):
super().__init__(
ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode, PCK_threshold_list=pck_threshold_list)
self.ann_info['flip_pairs'] = []
self.ann_info['upper_body_ids'] = []
self.ann_info['lower_body_ids'] = []
self.ann_info['use_different_joint_weights'] = False
self.ann_info['joint_weights'] = np.array([1., ],
dtype=np.float32).reshape((self.ann_info['num_joints'], 1))
self.coco = COCO(ann_file)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.img_ids = self.coco.getImgIds()
self.classes = [
cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds())
]
self.num_classes = len(self.classes)
self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds()))
self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes))
if valid_class_ids is not None: # None by default
self.valid_class_ids = valid_class_ids
else:
self.valid_class_ids = self.coco.getCatIds()
self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids]
self.cats = self.coco.cats
self.max_kpt_num = max_kpt_num
# Also update self.cat2obj
self.db = self._get_db()
self.num_shots = num_shots
if not test_mode:
# Update every training epoch
self.random_paired_samples()
else:
self.num_queries = num_queries
self.num_episodes = num_episodes
self.make_paired_samples()
def random_paired_samples(self):
num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes]
# balance the dataset
max_num_data = max(num_datas)
all_samples = []
for cls in self.valid_class_ids:
for i in range(max_num_data):
shot = random.sample(self.cat2obj[cls], self.num_shots + 1)
all_samples.append(shot)
self.paired_samples = np.array(all_samples)
np.random.shuffle(self.paired_samples)
def make_paired_samples(self):
random.seed(1)
np.random.seed(0)
all_samples = []
for cls in self.valid_class_ids:
for _ in range(self.num_episodes):
shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries)
sample_ids = shots[:self.num_shots]
query_ids = shots[self.num_shots:]
for query_id in query_ids:
all_samples.append(sample_ids + [query_id])
self.paired_samples = np.array(all_samples)
def _select_kpt(self, obj, kpt_id):
obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id + 1]
obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id + 1]
obj['kpt_id'] = kpt_id
return obj
@staticmethod
def _get_mapping_id_name(imgs):
"""
Args:
imgs (dict): dict of image info.
Returns:
tuple: Image name & id mapping dicts.
- id2name (dict): Mapping image id to name.
- name2id (dict): Mapping image name to id.
"""
id2name = {}
name2id = {}
for image_id, image in imgs.items():
file_name = image['file_name']
id2name[image_id] = file_name
name2id[file_name] = image_id
return id2name, name2id
def _get_db(self):
"""Ground truth bbox and keypoints."""
self.obj_id = 0
self.cat2obj = {}
for i in self.coco.getCatIds():
self.cat2obj.update({i: []})
gt_db = []
for img_id in self.img_ids:
gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id))
return gt_db
def _load_coco_keypoint_annotation_kernel(self, img_id):
"""load annotation from COCOAPI.
Note:
bbox:[x1, y1, w, h]
Args:
img_id: coco image id
Returns:
dict: db entry
"""
img_ann = self.coco.loadImgs(img_id)[0]
width = img_ann['width']
height = img_ann['height']
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = self.coco.loadAnns(ann_ids)
# sanitize bboxes
valid_objs = []
for obj in objs:
if 'bbox' not in obj:
continue
x, y, w, h = obj['bbox']
x1 = max(0, x)
y1 = max(0, y)
x2 = min(width - 1, x1 + max(0, w - 1))
y2 = min(height - 1, y1 + max(0, h - 1))
if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
bbox_id = 0
rec = []
for obj in objs:
if 'keypoints' not in obj:
continue
if max(obj['keypoints']) == 0:
continue
if 'num_keypoints' in obj and obj['num_keypoints'] == 0:
continue
category_id = obj['category_id']
# the number of keypoint for this specific category
cat_kpt_num = int(len(obj['keypoints']) / 3)
if self.max_kpt_num is None:
kpt_num = cat_kpt_num
else:
kpt_num = self.max_kpt_num
joints_3d = np.zeros((kpt_num, 3), dtype=np.float32)
joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32)
keypoints = np.array(obj['keypoints']).reshape(-1, 3)
joints_3d[:cat_kpt_num, :2] = keypoints[:, :2]
joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3])
center, scale = self._xywh2cs(*obj['clean_bbox'][:4])
image_file = os.path.join(self.img_prefix, self.id2name[img_id])
self.cat2obj[category_id].append(self.obj_id)
rec.append({
'image_file': image_file,
'center': center,
'scale': scale,
'rotation': 0,
'bbox': obj['clean_bbox'][:4],
'bbox_score': 1,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'category_id': category_id,
'cat_kpt_num': cat_kpt_num,
'bbox_id': self.obj_id,
'skeleton': self.coco.cats[obj['category_id']]['skeleton'],
})
bbox_id = bbox_id + 1
self.obj_id += 1
return rec
def _xywh2cs(self, x, y, w, h):
"""This encodes bbox(x,y,w,w) into (center, scale)
Args:
x, y, w, h
Returns:
tuple: A tuple containing center and scale.
- center (np.ndarray[float32](2,)): center of the bbox (x, y).
- scale (np.ndarray[float32](2,)): scale of the bbox w & h.
"""
aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1]
center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
#
# if (not self.test_mode) and np.random.rand() < 0.3:
# center += 0.4 * (np.random.rand(2) - 0.5) * [w, h]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
# pixel std is 200.0
scale = np.array([w / 200.0, h / 200.0], dtype=np.float32)
# padding to include proper amount of context
scale = scale * 1.25
return center, scale
def evaluate(self, outputs, res_folder, metric='PCK', **kwargs):
"""Evaluate interhand2d keypoint results. The pose prediction results
will be saved in `${res_folder}/result_keypoints.json`.
Note:
batch_size: N
num_keypoints: K
heatmap height: H
heatmap width: W
Args:
outputs (list(preds, boxes, image_path, output_heatmap))
:preds (np.ndarray[N,K,3]): The first two dimensions are
coordinates, score is the third dimension of the array.
:boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]
, scale[1],area, score]
:image_paths (list[str]): For example,
['Capture12/0390_dh_touchROM/cam410209/image62434.jpg'].
:output_heatmap (np.ndarray[N, K, H, W]): model outputs.
res_folder (str): Path of directory to save the results.
metric (str | list[str]): Metric to be performed.
Options: 'PCK', 'AUC', 'EPE', 'NME'.
Returns:
dict: Evaluation results for evaluation metric.
"""
metrics = metric if isinstance(metric, list) else [metric]
allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME']
for metric in metrics:
if metric not in allowed_metrics:
raise KeyError(f'metric {metric} is not supported')
res_file = os.path.join(res_folder, 'result_keypoints.json')
kpts = []
for output in outputs:
preds = output['preds']
boxes = output['boxes']
image_paths = output['image_paths']
bbox_ids = output['bbox_ids']
batch_size = len(image_paths)
for i in range(batch_size):
image_id = self.name2id[image_paths[i][len(self.img_prefix):]]
kpts.append({
'keypoints': preds[i].tolist(),
'center': boxes[i][0:2].tolist(),
'scale': boxes[i][2:4].tolist(),
'area': float(boxes[i][4]),
'score': float(boxes[i][5]),
'image_id': image_id,
'bbox_id': bbox_ids[i]
})
kpts = self._sort_and_unique_bboxes(kpts)
self._write_keypoint_results(kpts, res_file)
info_str = self._report_metric(res_file, metrics)
name_value = OrderedDict(info_str)
return name_value
================================================
FILE: models/datasets/datasets/mp100/transformer_base_dataset.py
================================================
import copy
from abc import ABCMeta, abstractmethod
import json_tricks as json
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmpose.core.evaluation.top_down_eval import (keypoint_pck_accuracy)
from mmpose.datasets import DATASETS
from mmpose.datasets.pipelines import Compose
from torch.utils.data import Dataset
@DATASETS.register_module()
class TransformerBaseDataset(Dataset, metaclass=ABCMeta):
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
test_mode=False):
self.image_info = {}
self.ann_info = {}
self.annotations_path = ann_file
if not img_prefix.endswith('/'):
img_prefix = img_prefix + '/'
self.img_prefix = img_prefix
self.pipeline = pipeline
self.test_mode = test_mode
self.ann_info['image_size'] = np.array(data_cfg['image_size'])
self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size'])
self.ann_info['num_joints'] = data_cfg['num_joints']
self.ann_info['flip_pairs'] = None
self.ann_info['inference_channel'] = data_cfg['inference_channel']
self.ann_info['num_output_channels'] = data_cfg['num_output_channels']
self.ann_info['dataset_channel'] = data_cfg['dataset_channel']
self.db = []
self.num_shots = 1
self.paired_samples = []
self.pipeline = Compose(self.pipeline)
@abstractmethod
def _get_db(self):
"""Load dataset."""
raise NotImplementedError
@abstractmethod
def _select_kpt(self, obj, kpt_id):
"""Select kpt."""
raise NotImplementedError
@abstractmethod
def evaluate(self, cfg, preds, output_dir, *args, **kwargs):
"""Evaluate keypoint results."""
raise NotImplementedError
@staticmethod
def _write_keypoint_results(keypoints, res_file):
"""Write results into a json file."""
with open(res_file, 'w') as f:
json.dump(keypoints, f, sort_keys=True, indent=4)
def _report_metric(self,
res_file,
metrics,
pck_thr=0.2,
pckh_thr=0.7,
auc_nor=30):
"""Keypoint evaluation.
Args:
res_file (str): Json file stored prediction results.
metrics (str | list[str]): Metric to be performed.
Options: 'PCK', 'PCKh', 'AUC', 'EPE'.
pck_thr (float): PCK threshold, default as 0.2.
pckh_thr (float): PCKh threshold, default as 0.7.
auc_nor (float): AUC normalization factor, default as 30 pixel.
Returns:
List: Evaluation results for evaluation metric.
"""
info_str = []
with open(res_file, 'r') as fin:
preds = json.load(fin)
assert len(preds) == len(self.paired_samples)
outputs = []
gts = []
masks = []
threshold_bbox = []
threshold_head_box = []
for pred, pair in zip(preds, self.paired_samples):
item = self.db[pair[-1]]
outputs.append(np.array(pred['keypoints'])[:, :-1])
gts.append(np.array(item['joints_3d'])[:, :-1])
mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0)
mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0)
for id_s in pair[:-1]:
mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0))
masks.append(np.bitwise_and(mask_query, mask_sample))
if 'PCK' in metrics:
bbox = np.array(item['bbox'])
bbox_thr = np.max(bbox[2:])
threshold_bbox.append(np.array([bbox_thr, bbox_thr]))
if 'PCKh' in metrics:
head_box_thr = item['head_size']
threshold_head_box.append(
np.array([head_box_thr, head_box_thr]))
if 'PCK' in metrics:
pck_avg = []
for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox):
_, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt, 0),
np.expand_dims(mask, 0), pck_thr, np.expand_dims(thr_bbox, 0))
pck_avg.append(pck)
info_str.append(('PCK', np.mean(pck_avg)))
return info_str
def _merge_obj(self, Xs_list, Xq, idx):
""" merge Xs_list and Xq.
:param Xs_list: N-shot samples X
:param Xq: query X
:param idx: id of paired_samples
:return: Xall
"""
Xall = dict()
Xall['img_s'] = [Xs['img'] for Xs in Xs_list]
Xall['target_s'] = [Xs['target'] for Xs in Xs_list]
Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list]
xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list]
Xall['img_q'] = Xq['img']
Xall['target_q'] = Xq['target']
Xall['target_weight_q'] = Xq['target_weight']
xq_img_metas = Xq['img_metas'].data
img_metas = dict()
for key in xq_img_metas.keys():
img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas]
img_metas['query_' + key] = xq_img_metas[key]
img_metas['bbox_id'] = idx
Xall['img_metas'] = DC(img_metas, cpu_only=True)
return Xall
def __len__(self):
"""Get the size of the dataset."""
return len(self.paired_samples)
def __getitem__(self, idx):
"""Get the sample given index."""
pair_ids = self.paired_samples[idx] # [num_shots support ids, query id]
assert len(pair_ids) == self.num_shots + 1
sample_id_list = pair_ids[:self.num_shots]
query_id = pair_ids[-1]
sample_obj_list = []
for sample_id in sample_id_list:
sample_obj = copy.deepcopy(self.db[sample_id])
sample_obj['ann_info'] = copy.deepcopy(self.ann_info)
sample_obj_list.append(sample_obj)
query_obj = copy.deepcopy(self.db[query_id])
query_obj['ann_info'] = copy.deepcopy(self.ann_info)
Xs_list = []
for sample_obj in sample_obj_list:
Xs = self.pipeline(sample_obj) # dict with ['img', 'target', 'target_weight', 'img_metas'],
Xs_list.append(Xs) # Xs['target'] is of shape [100, map_h, map_w]
Xq = self.pipeline(query_obj)
Xall = self._merge_obj(Xs_list, Xq, idx)
Xall['skeleton'] = self.db[query_id]['skeleton']
return Xall
def _sort_and_unique_bboxes(self, kpts, key='bbox_id'):
"""sort kpts and remove the repeated ones."""
kpts = sorted(kpts, key=lambda x: x[key])
num = len(kpts)
for i in range(num - 1, 0, -1):
if kpts[i][key] == kpts[i - 1][key]:
del kpts[i]
return kpts
================================================
FILE: models/datasets/datasets/mp100/transformer_dataset.py
================================================
import os
import random
from collections import OrderedDict
import numpy as np
from mmpose.datasets import DATASETS
from xtcocotools.coco import COCO
from .transformer_base_dataset import TransformerBaseDataset
@DATASETS.register_module()
class TransformerPoseDataset(TransformerBaseDataset):
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
valid_class_ids,
max_kpt_num=None,
num_shots=1,
num_queries=100,
num_episodes=1,
test_mode=False):
super().__init__(
ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode)
self.ann_info['flip_pairs'] = []
self.ann_info['upper_body_ids'] = []
self.ann_info['lower_body_ids'] = []
self.ann_info['use_different_joint_weights'] = False
self.ann_info['joint_weights'] = np.array([1., ],
dtype=np.float32).reshape((self.ann_info['num_joints'], 1))
self.coco = COCO(ann_file)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.img_ids = self.coco.getImgIds()
self.classes = [
cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds())
]
self.num_classes = len(self.classes)
self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds()))
self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes))
if valid_class_ids is not None: # None by default
self.valid_class_ids = valid_class_ids
else:
self.valid_class_ids = self.coco.getCatIds()
self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids]
self.cats = self.coco.cats
self.max_kpt_num = max_kpt_num
# Also update self.cat2obj
self.db = self._get_db()
self.num_shots = num_shots
if not test_mode:
# Update every training epoch
self.random_paired_samples()
else:
self.num_queries = num_queries
self.num_episodes = num_episodes
self.make_paired_samples()
def random_paired_samples(self):
num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes]
# balance the dataset
max_num_data = max(num_datas)
all_samples = []
for cls in self.valid_class_ids:
for i in range(max_num_data):
shot = random.sample(self.cat2obj[cls], self.num_shots + 1)
all_samples.append(shot)
self.paired_samples = np.array(all_samples)
np.random.shuffle(self.paired_samples)
def make_paired_samples(self):
random.seed(1)
np.random.seed(0)
all_samples = []
for cls in self.valid_class_ids:
for _ in range(self.num_episodes):
shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries)
sample_ids = shots[:self.num_shots]
query_ids = shots[self.num_shots:]
for query_id in query_ids:
all_samples.append(sample_ids + [query_id])
self.paired_samples = np.array(all_samples)
def _select_kpt(self, obj, kpt_id):
obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id + 1]
obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id + 1]
obj['kpt_id'] = kpt_id
return obj
@staticmethod
def _get_mapping_id_name(imgs):
"""
Args:
imgs (dict): dict of image info.
Returns:
tuple: Image name & id mapping dicts.
- id2name (dict): Mapping image id to name.
- name2id (dict): Mapping image name to id.
"""
id2name = {}
name2id = {}
for image_id, image in imgs.items():
file_name = image['file_name']
id2name[image_id] = file_name
name2id[file_name] = image_id
return id2name, name2id
def _get_db(self):
"""Ground truth bbox and keypoints."""
self.obj_id = 0
self.cat2obj = {}
for i in self.coco.getCatIds():
self.cat2obj.update({i: []})
gt_db = []
for img_id in self.img_ids:
gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id))
return gt_db
def _load_coco_keypoint_annotation_kernel(self, img_id):
"""load annotation from COCOAPI.
Note:
bbox:[x1, y1, w, h]
Args:
img_id: coco image id
Returns:
dict: db entry
"""
img_ann = self.coco.loadImgs(img_id)[0]
width = img_ann['width']
height = img_ann['height']
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = self.coco.loadAnns(ann_ids)
# sanitize bboxes
valid_objs = []
for obj in objs:
if 'bbox' not in obj:
continue
x, y, w, h = obj['bbox']
x1 = max(0, x)
y1 = max(0, y)
x2 = min(width - 1, x1 + max(0, w - 1))
y2 = min(height - 1, y1 + max(0, h - 1))
if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
bbox_id = 0
rec = []
for obj in objs:
if 'keypoints' not in obj:
continue
if max(obj['keypoints']) == 0:
continue
if 'num_keypoints' in obj and obj['num_keypoints'] == 0:
continue
category_id = obj['category_id']
# the number of keypoint for this specific category
cat_kpt_num = int(len(obj['keypoints']) / 3)
if self.max_kpt_num is None:
kpt_num = cat_kpt_num
else:
kpt_num = self.max_kpt_num
joints_3d = np.zeros((kpt_num, 3), dtype=np.float32)
joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32)
keypoints = np.array(obj['keypoints']).reshape(-1, 3)
joints_3d[:cat_kpt_num, :2] = keypoints[:, :2]
joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3])
center, scale = self._xywh2cs(*obj['clean_bbox'][:4])
image_file = os.path.join(self.img_prefix, self.id2name[img_id])
if os.path.exists(image_file):
self.cat2obj[category_id].append(self.obj_id)
rec.append({
'image_file': image_file,
'center': center,
'scale': scale,
'rotation': 0,
'bbox': obj['clean_bbox'][:4],
'bbox_score': 1,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'category_id': category_id,
'cat_kpt_num': cat_kpt_num,
'bbox_id': self.obj_id,
'skeleton': self.coco.cats[obj['category_id']]['skeleton'],
})
bbox_id = bbox_id + 1
self.obj_id += 1
return rec
def _xywh2cs(self, x, y, w, h):
"""This encodes bbox(x,y,w,w) into (center, scale)
Args:
x, y, w, h
Returns:
tuple: A tuple containing center and scale.
- center (np.ndarray[float32](2,)): center of the bbox (x, y).
- scale (np.ndarray[float32](2,)): scale of the bbox w & h.
"""
aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1]
center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
#
# if (not self.test_mode) and np.random.rand() < 0.3:
# center += 0.4 * (np.random.rand(2) - 0.5) * [w, h]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
# pixel std is 200.0
scale = np.array([w / 200.0, h / 200.0], dtype=np.float32)
# padding to include proper amount of context
scale = scale * 1.25
return center, scale
def evaluate(self, outputs, res_folder, metric='PCK', **kwargs):
"""Evaluate interhand2d keypoint results. The pose prediction results
will be saved in `${res_folder}/result_keypoints.json`.
Note:
batch_size: N
num_keypoints: K
heatmap height: H
heatmap width: W
Args:
outputs (list(preds, boxes, image_path, output_heatmap))
:preds (np.ndarray[N,K,3]): The first two dimensions are
coordinates, score is the third dimension of the array.
:boxes (np.ndarray[N,6]): [center[0], center[1], scale[0]
, scale[1],area, score]
:image_paths (list[str]): For example,
['Capture12/0390_dh_touchROM/cam410209/image62434.jpg'].
:output_heatmap (np.ndarray[N, K, H, W]): model outputs.
res_folder (str): Path of directory to save the results.
metric (str | list[str]): Metric to be performed.
Options: 'PCK', 'AUC', 'EPE', 'NME'.
Returns:
dict: Evaluation results for evaluation metric.
"""
metrics = metric if isinstance(metric, list) else [metric]
allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME']
for metric in metrics:
if metric not in allowed_metrics:
raise KeyError(f'metric {metric} is not supported')
res_file = os.path.join(res_folder, 'result_keypoints.json')
kpts = []
for output in outputs:
preds = output['preds']
boxes = output['boxes']
image_paths = output['image_paths']
bbox_ids = output['bbox_ids']
batch_size = len(image_paths)
for i in range(batch_size):
image_id = self.name2id[image_paths[i][len(self.img_prefix):]]
kpts.append({
'keypoints': preds[i].tolist(),
'center': boxes[i][0:2].tolist(),
'scale': boxes[i][2:4].tolist(),
'area': float(boxes[i][4]),
'score': float(boxes[i][5]),
'image_id': image_id,
'bbox_id': bbox_ids[i]
})
kpts = self._sort_and_unique_bboxes(kpts)
self._write_keypoint_results(kpts, res_file)
info_str = self._report_metric(res_file, metrics)
name_value = OrderedDict(info_str)
return name_value
================================================
FILE: models/datasets/pipelines/__init__.py
================================================
from .top_down_transform import (TopDownAffineFewShot,
TopDownGenerateTargetFewShot)
__all__ = [
'TopDownGenerateTargetFewShot', 'TopDownAffineFewShot'
]
================================================
FILE: models/datasets/pipelines/post_transforms.py
================================================
# ------------------------------------------------------------------------------
# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
# Original licence: Copyright (c) Microsoft, under the MIT License.
# ------------------------------------------------------------------------------
import cv2
import numpy as np
def get_affine_transform(center,
scale,
rot,
output_size,
shift=(0., 0.),
inv=False):
"""Get the affine transform matrix, given the center/scale/rot/output_size.
Args:
center (np.ndarray[2, ]): Center of the bounding box (x, y).
scale (np.ndarray[2, ]): Scale of the bounding box
wrt [width, height].
rot (float): Rotation angle (degree).
output_size (np.ndarray[2, ]): Size of the destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height.
Default (0., 0.).
inv (bool): Option to inverse the affine transform direction.
(inv=False: src->dst or inv=True: dst->src)
Returns:
np.ndarray: The transform matrix.
"""
assert len(center) == 2
assert len(scale) == 2
assert len(output_size) == 2
assert len(shift) == 2
# pixel_std is 200.
scale_tmp = scale * 200.0
shift = np.array(shift)
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = rotate_point([0., src_w * -0.5], rot_rad)
dst_dir = np.array([0., dst_w * -0.5])
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
src[2, :] = _get_3rd_point(src[0, :], src[1, :])
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
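# Worked example (uses the imports above): a 200x200-pixel box centered at
# (100, 100), no rotation, mapped onto a 200x200 output gives the identity:
#   trans = get_affine_transform(np.array([100., 100.]),
#                                np.array([1., 1.]), 0., np.array([200., 200.]))
#   # affine_transform (defined below) then maps (100, 100) -> (100, 100).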
def affine_transform(pt, trans_mat):
"""Apply an affine transformation to the points.
Args:
pt (np.ndarray): a 2 dimensional point to be transformed
trans_mat (np.ndarray): 2x3 matrix of an affine transform
Returns:
np.ndarray: Transformed points.
"""
assert len(pt) == 2
new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.])
return new_pt
def _get_3rd_point(a, b):
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): point(x,y)
b (np.ndarray): point(x,y)
Returns:
np.ndarray: The 3rd point.
"""
assert len(a) == 2
assert len(b) == 2
direction = a - b
third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
return third_pt
def rotate_point(pt, angle_rad):
"""Rotate a point by an angle.
Args:
pt (list[float]): 2 dimensional point to be rotated
angle_rad (float): rotation angle by radian
Returns:
list[float]: Rotated point.
"""
assert len(pt) == 2
sn, cs = np.sin(angle_rad), np.cos(angle_rad)
new_x = pt[0] * cs - pt[1] * sn
new_y = pt[0] * sn + pt[1] * cs
rotated_pt = [new_x, new_y]
return rotated_pt
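# Minimal self-check (illustrative, not part of the original file): a point
# mapped with the src->dst transform and back with inv=True should return to
# its original location, up to floating-point error.
if __name__ == '__main__':
    center = np.array([100., 100.])
    scale = np.array([0.5, 0.5])
    size = np.array([64, 64])
    fwd = get_affine_transform(center, scale, 30., size)
    bwd = get_affine_transform(center, scale, 30., size, inv=True)
    pt = np.array([90., 110.])
    rt = affine_transform(affine_transform(pt, fwd), bwd)
    assert np.allclose(pt, rt, atol=1e-4), (pt, rt)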
================================================
FILE: models/datasets/pipelines/top_down_transform.py
================================================
import cv2
import numpy as np
from mmpose.core.post_processing import (get_warp_matrix,
warp_affine_joints)
from mmpose.datasets.builder import PIPELINES
from .post_transforms import (affine_transform,
get_affine_transform)
@PIPELINES.register_module()
class TopDownAffineFewShot:
"""Affine transform the image to make input.
Required keys: 'img', 'joints_3d', 'joints_3d_visible', 'ann_info',
'scale', 'rotation' and 'center'. Modified keys: 'img', 'joints_3d', and
'joints_3d_visible'.
Args:
use_udp (bool): To use unbiased data processing.
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
"""
def __init__(self, use_udp=False):
self.use_udp = use_udp
def __call__(self, results):
image_size = results['ann_info']['image_size']
img = results['img']
joints_3d = results['joints_3d']
joints_3d_visible = results['joints_3d_visible']
c = results['center']
s = results['scale']
r = results['rotation']
if self.use_udp:
trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0)
img = cv2.warpAffine(
img,
trans, (int(image_size[0]), int(image_size[1])),
flags=cv2.INTER_LINEAR)
joints_3d[:, 0:2] = \
warp_affine_joints(joints_3d[:, 0:2].copy(), trans)
else:
trans = get_affine_transform(c, s, r, image_size)
img = cv2.warpAffine(
img,
trans, (int(image_size[0]), int(image_size[1])),
flags=cv2.INTER_LINEAR)
for i in range(len(joints_3d)):
if joints_3d_visible[i, 0] > 0.0:
joints_3d[i,
0:2] = affine_transform(joints_3d[i, 0:2], trans)
results['img'] = img
results['joints_3d'] = joints_3d
results['joints_3d_visible'] = joints_3d_visible
return results
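# Illustrative usage (hypothetical values, not part of the original file):
# the transform consumes the standard top-down `results` dict. With
# use_udp=False only the local get_affine_transform path is exercised.
def _demo_top_down_affine():
    results = {
        'img': np.zeros((480, 640, 3), dtype=np.uint8),
        'joints_3d': np.zeros((17, 3), dtype=np.float32),
        'joints_3d_visible': np.ones((17, 3), dtype=np.float32),
        'center': np.array([320., 240.]),
        'scale': np.array([1.5, 1.5]),
        'rotation': 0,
        'ann_info': {'image_size': np.array([192, 256])},
    }
    out = TopDownAffineFewShot(use_udp=False)(results)
    return out['img'].shape  # (256, 192, 3)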
@PIPELINES.register_module()
class TopDownGenerateTargetFewShot:
"""Generate the target heatmap.
Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'.
Modified keys: 'target', and 'target_weight'.
Args:
sigma: Sigma of heatmap gaussian for 'MSRA' approach.
kernel: Kernel of heatmap gaussian for 'Megvii' approach.
encoding (str): Approach to generate target heatmaps.
Currently supported approaches: 'MSRA', 'UDP'.
Default: 'MSRA'.
unbiased_encoding (bool): Option to use unbiased
encoding methods.
Paper ref: Zhang et al. Distribution-Aware Coordinate
Representation for Human Pose Estimation (CVPR 2020).
valid_radius_factor (float): Valid radius factor for the 'CombinedTarget' UDP target.
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
target_type (str): Supported targets: 'GaussianHeatMap',
'CombinedTarget'. Default: 'GaussianHeatMap'.
CombinedTarget: The combination of classification target
(response map) and regression target (offset map).
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
"""
def __init__(self,
sigma=2,
kernel=(11, 11),
valid_radius_factor=0.0546875,
target_type='GaussianHeatMap',
encoding='MSRA',
unbiased_encoding=False):
self.sigma = sigma
self.unbiased_encoding = unbiased_encoding
self.kernel = kernel
self.valid_radius_factor = valid_radius_factor
self.target_type = target_type
self.encoding = encoding
def _msra_generate_target(self, cfg, joints_3d, joints_3d_visible, sigma):
"""Generate the target heatmap via "MSRA" approach.
Args:
cfg (dict): data config
joints_3d: np.ndarray ([num_joints, 3])
joints_3d_visible: np.ndarray ([num_joints, 3])
sigma: Sigma of heatmap gaussian
Returns:
tuple: A tuple containing targets.
- target: Target heatmaps.
- target_weight: (1: visible, 0: invisible)
"""
num_joints = len(joints_3d)
image_size = cfg['image_size']
W, H = cfg['heatmap_size']
joint_weights = cfg['joint_weights']
use_different_joint_weights = cfg['use_different_joint_weights']
assert not use_different_joint_weights
target_weight = np.zeros((num_joints, 1), dtype=np.float32)
target = np.zeros((num_joints, H, W), dtype=np.float32)
# 3-sigma rule
tmp_size = sigma * 3
if self.unbiased_encoding:
for joint_id in range(num_joints):
target_weight[joint_id] = joints_3d_visible[joint_id, 0]
feat_stride = image_size / [W, H]
mu_x = joints_3d[joint_id][0] / feat_stride[0]
mu_y = joints_3d[joint_id][1] / feat_stride[1]
# Check that any part of the gaussian is in-bounds
ul = [mu_x - tmp_size, mu_y - tmp_size]
br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1]
if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0:
target_weight[joint_id] = 0
if target_weight[joint_id] == 0:
continue
x = np.arange(0, W, 1, np.float32)
y = np.arange(0, H, 1, np.float32)
y = y[:, None]
if target_weight[joint_id] > 0.5:
target[joint_id] = np.exp(-((x - mu_x) ** 2 +
(y - mu_y) ** 2) /
(2 * sigma ** 2))
else:
for joint_id in range(num_joints):
target_weight[joint_id] = joints_3d_visible[joint_id, 0]
feat_stride = image_size / [W, H]
mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5)
mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5)
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0:
target_weight[joint_id] = 0
if target_weight[joint_id] > 0.5:
size = 2 * tmp_size + 1
x = np.arange(0, size, 1, np.float32)
y = x[:, None]
x0 = y0 = size // 2
# The gaussian is not normalized;
# we want the center value to equal 1
g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))
# Usable gaussian range
g_x = max(0, -ul[0]), min(br[0], W) - ul[0]
g_y = max(0, -ul[1]), min(br[1], H) - ul[1]
# Image range
img_x = max(0, ul[0]), min(br[0], W)
img_y = max(0, ul[1]), min(br[1], H)
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
if use_different_joint_weights:
target_weight = np.multiply(target_weight, joint_weights)
return target, target_weight
def _udp_generate_target(self, cfg, joints_3d, joints_3d_visible, factor,
target_type):
"""Generate the target heatmap via 'UDP' approach. Paper ref: Huang et
al. The Devil is in the Details: Delving into Unbiased Data Processing
for Human Pose Estimation (CVPR 2020).
Note:
num keypoints: K
heatmap height: H
heatmap width: W
num target channels: C
C = K if target_type=='GaussianHeatMap'
C = 3*K if target_type=='CombinedTarget'
Args:
cfg (dict): data config
joints_3d (np.ndarray[K, 3]): Annotated keypoints.
joints_3d_visible (np.ndarray[K, 3]): Visibility of keypoints.
factor (float): kernel factor for GaussianHeatMap target or
valid radius factor for CombinedTarget.
target_type (str): 'GaussianHeatMap' or 'CombinedTarget'.
GaussianHeatMap: Heatmap target with gaussian distribution.
CombinedTarget: The combination of classification target
(response map) and regression target (offset map).
Returns:
tuple: A tuple containing targets.
- target (np.ndarray[C, H, W]): Target heatmaps.
- target_weight (np.ndarray[K, 1]): (1: visible, 0: invisible)
"""
num_joints = len(joints_3d)
image_size = cfg['image_size']
heatmap_size = cfg['heatmap_size']
joint_weights = cfg['joint_weights']
use_different_joint_weights = cfg['use_different_joint_weights']
assert not use_different_joint_weights
target_weight = np.ones((num_joints, 1), dtype=np.float32)
target_weight[:, 0] = joints_3d_visible[:, 0]
assert target_type in ['GaussianHeatMap', 'CombinedTarget']
if target_type == 'GaussianHeatMap':
target = np.zeros((num_joints, heatmap_size[1], heatmap_size[0]),
dtype=np.float32)
tmp_size = factor * 3
# prepare for gaussian
size = 2 * tmp_size + 1
x = np.arange(0, size, 1, np.float32)
y = x[:, None]
for joint_id in range(num_joints):
feat_stride = (image_size - 1.0) / (heatmap_size - 1.0)
mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5)
mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5)
# Check that any part of the gaussian is in-bounds
ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
if ul[0] >= heatmap_size[0] or ul[1] >= heatmap_size[1] \
or br[0] < 0 or br[1] < 0:
# Gaussian is entirely out of bounds; skip this joint
target_weight[joint_id] = 0
continue
# Generate gaussian
mu_x_ac = joints_3d[joint_id][0] / feat_stride[0]
mu_y_ac = joints_3d[joint_id][1] / feat_stride[1]
x0 = y0 = size // 2
x0 += mu_x_ac - mu_x
y0 += mu_y_ac - mu_y
g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * factor ** 2))
# Usable gaussian range
g_x = max(0, -ul[0]), min(br[0], heatmap_size[0]) - ul[0]
g_y = max(0, -ul[1]), min(br[1], heatmap_size[1]) - ul[1]
# Image range
img_x = max(0, ul[0]), min(br[0], heatmap_size[0])
img_y = max(0, ul[1]), min(br[1], heatmap_size[1])
v = target_weight[joint_id]
if v > 0.5:
target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \
g[g_y[0]:g_y[1], g_x[0]:g_x[1]]
elif target_type == 'CombinedTarget':
target = np.zeros(
(num_joints, 3, heatmap_size[1] * heatmap_size[0]),
dtype=np.float32)
feat_width = heatmap_size[0]
feat_height = heatmap_size[1]
feat_x_int = np.arange(0, feat_width)
feat_y_int = np.arange(0, feat_height)
feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int)
feat_x_int = feat_x_int.flatten()
feat_y_int = feat_y_int.flatten()
# Calculate the radius of the positive area in classification
# heatmap.
valid_radius = factor * heatmap_size[1]
feat_stride = (image_size - 1.0) / (heatmap_size - 1.0)
for joint_id in range(num_joints):
mu_x = joints_3d[joint_id][0] / feat_stride[0]
mu_y = joints_3d[joint_id][1] / feat_stride[1]
x_offset = (mu_x - feat_x_int) / valid_radius
y_offset = (mu_y - feat_y_int) / valid_radius
dis = x_offset ** 2 + y_offset ** 2
keep_pos = np.where(dis <= 1)[0]
v = target_weight[joint_id]
if v > 0.5:
target[joint_id, 0, keep_pos] = 1
target[joint_id, 1, keep_pos] = x_offset[keep_pos]
target[joint_id, 2, keep_pos] = y_offset[keep_pos]
target = target.reshape(num_joints * 3, heatmap_size[1],
heatmap_size[0])
if use_different_joint_weights:
target_weight = np.multiply(target_weight, joint_weights)
return target, target_weight
def __call__(self, results):
"""Generate the target heatmap."""
joints_3d = results['joints_3d']
joints_3d_visible = results['joints_3d_visible']
assert self.encoding in ['MSRA', 'UDP']
if self.encoding == 'MSRA':
if isinstance(self.sigma, list):
num_sigmas = len(self.sigma)
cfg = results['ann_info']
num_joints = len(joints_3d)
heatmap_size = cfg['heatmap_size']
target = np.empty(
(0, num_joints, heatmap_size[1], heatmap_size[0]),
dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_sigmas):
target_i, target_weight_i = self._msra_generate_target(
cfg, joints_3d, joints_3d_visible, self.sigma[i])
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._msra_generate_target(
results['ann_info'], joints_3d, joints_3d_visible,
self.sigma)
elif self.encoding == 'UDP':
if self.target_type == 'CombinedTarget':
factors = self.valid_radius_factor
channel_factor = 3
elif self.target_type == 'GaussianHeatMap':
factors = self.sigma
channel_factor = 1
if isinstance(factors, list):
num_factors = len(factors)
cfg = results['ann_info']
num_joints = len(joints_3d)
W, H = cfg['heatmap_size']
target = np.empty((0, channel_factor * num_joints, H, W),
dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_factors):
target_i, target_weight_i = self._udp_generate_target(
cfg, joints_3d, joints_3d_visible, factors[i],
self.target_type)
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._udp_generate_target(
results['ann_info'], joints_3d, joints_3d_visible, factors,
self.target_type)
else:
raise ValueError(
f'Encoding approach {self.encoding} is not supported!')
results['target'] = target
results['target_weight'] = target_weight
return results
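# Illustrative usage (hypothetical config values, not part of the original
# file): generating a single-sigma MSRA heatmap target for dummy keypoints.
if __name__ == '__main__':
    results = {
        'joints_3d': np.array([[96., 128., 0.]] * 5, dtype=np.float32),
        'joints_3d_visible': np.ones((5, 3), dtype=np.float32),
        'ann_info': {
            'image_size': np.array([192, 256]),
            'heatmap_size': np.array([48, 64]),
            'joint_weights': np.ones((5, 1), dtype=np.float32),
            'use_different_joint_weights': False,
        },
    }
    out = TopDownGenerateTargetFewShot(sigma=2, encoding='MSRA')(results)
    print(out['target'].shape, out['target_weight'].shape)  # (5, 64, 48) (5, 1)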
================================================
FILE: models/models/__init__.py
================================================
from .backbones import * # noqa
from .detectors import * # noqa
from .keypoint_heads import * # noqa
================================================
FILE: models/models/backbones/__init__.py
================================================
from .swin_transformer_v2 import SwinTransformerV2
================================================
FILE: models/models/backbones/simmim.py
================================================
# --------------------------------------------------------
# SimMIM
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Zhenda Xie
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.models.layers import trunc_normal_
from .swin_transformer import SwinTransformer
from .swin_transformer_v2 import SwinTransformerV2
def norm_targets(targets, patch_size):
assert patch_size % 2 == 1
targets_ = targets
targets_count = torch.ones_like(targets)
targets_square = targets ** 2.
targets_mean = F.avg_pool2d(targets, kernel_size=patch_size, stride=1, padding=patch_size // 2,
count_include_pad=False)
targets_square_mean = F.avg_pool2d(targets_square, kernel_size=patch_size, stride=1, padding=patch_size // 2,
count_include_pad=False)
targets_count = F.avg_pool2d(targets_count, kernel_size=patch_size, stride=1, padding=patch_size // 2,
count_include_pad=True) * (patch_size ** 2)
targets_var = (targets_square_mean - targets_mean ** 2.) * (targets_count / (targets_count - 1))
targets_var = torch.clamp(targets_var, min=0.)
targets_ = (targets_ - targets_mean) / (targets_var + 1.e-6) ** 0.5
return targets_
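# Illustrative check (not part of the original file): norm_targets
# standardizes each pixel against the mean/variance of its patch_size
# neighborhood, so a constant image normalizes to zero everywhere.
def _demo_norm_targets():
    x = torch.full((1, 3, 32, 32), 5.0)
    out = norm_targets(x, patch_size=7)
    return out.abs().max()  # ~0, up to the 1e-6 epsilon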
class SwinTransformerForSimMIM(SwinTransformer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
assert self.num_classes == 0
self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
trunc_normal_(self.mask_token, mean=0., std=.02)
def forward(self, x, mask):
x = self.patch_embed(x)
assert mask is not None
B, L, _ = x.shape
mask_tokens = self.mask_token.expand(B, L, -1)
w = mask.flatten(1).unsqueeze(-1).type_as(mask_tokens)
x = x * (1. - w) + mask_tokens * w
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
for layer in self.layers:
x = layer(x)
x = self.norm(x)
x = x.transpose(1, 2)
B, C, L = x.shape
H = W = int(L ** 0.5)
x = x.reshape(B, C, H, W)
return x
@torch.jit.ignore
def no_weight_decay(self):
return super().no_weight_decay() | {'mask_token'}
class SwinTransformerV2ForSimMIM(SwinTransformerV2):
def __init__(self, **kwargs):
super().__init__(**kwargs)
assert self.num_classes == 0
self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim))
trunc_normal_(self.mask_token, mean=0., std=.02)
def forward(self, x, mask):
x = self.patch_embed(x)
assert mask is not None
B, L, _ = x.shape
mask_tokens = self.mask_token.expand(B, L, -1)
w = mask.flatten(1).unsqueeze(-1).type_as(mask_tokens)
x = x * (1. - w) + mask_tokens * w
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
for layer in self.layers:
x = layer(x)
x = self.norm(x)
x = x.transpose(1, 2)
B, C, L = x.shape
H = W = int(L ** 0.5)
x = x.reshape(B, C, H, W)
return x
@torch.jit.ignore
def no_weight_decay(self):
return super().no_weight_decay() | {'mask_token'}
class SimMIM(nn.Module):
def __init__(self, config, encoder, encoder_stride, in_chans, patch_size):
super().__init__()
self.config = config
self.encoder = encoder
self.encoder_stride = encoder_stride
self.decoder = nn.Sequential(
nn.Conv2d(
in_channels=self.encoder.num_features,
out_channels=self.encoder_stride ** 2 * 3, kernel_size=1),
nn.PixelShuffle(self.encoder_stride),
)
self.in_chans = in_chans
self.patch_size = patch_size
def forward(self, x, mask):
z = self.encoder(x, mask)
x_rec = self.decoder(z)
mask = mask.repeat_interleave(self.patch_size, 1).repeat_interleave(self.patch_size, 2).unsqueeze(
1).contiguous()
# norm target as prompted
if self.config.NORM_TARGET.ENABLE:
x = norm_targets(x, self.config.NORM_TARGET.PATCH_SIZE)
loss_recon = F.l1_loss(x, x_rec, reduction='none')
loss = (loss_recon * mask).sum() / (mask.sum() + 1e-5) / self.in_chans
return loss
@torch.jit.ignore
def no_weight_decay(self):
if hasattr(self.encoder, 'no_weight_decay'):
return {'encoder.' + i for i in self.encoder.no_weight_decay()}
return {}
@torch.jit.ignore
def no_weight_decay_keywords(self):
if hasattr(self.encoder, 'no_weight_decay_keywords'):
return {'encoder.' + i for i in self.encoder.no_weight_decay_keywords()}
return {}
def build_simmim(config):
model_type = config.MODEL.TYPE
if model_type == 'swin':
encoder = SwinTransformerForSimMIM(
img_size=config.DATA.IMG_SIZE,
patch_size=config.MODEL.SWIN.PATCH_SIZE,
in_chans=config.MODEL.SWIN.IN_CHANS,
num_classes=0,
embed_dim=config.MODEL.SWIN.EMBED_DIM,
depths=config.MODEL.SWIN.DEPTHS,
num_heads=config.MODEL.SWIN.NUM_HEADS,
window_size=config.MODEL.SWIN.WINDOW_SIZE,
mlp_ratio=config.MODEL.SWIN.MLP_RATIO,
qkv_bias=config.MODEL.SWIN.QKV_BIAS,
qk_scale=config.MODEL.SWIN.QK_SCALE,
drop_rate=config.MODEL.DROP_RATE,
drop_path_rate=config.MODEL.DROP_PATH_RATE,
ape=config.MODEL.SWIN.APE,
patch_norm=config.MODEL.SWIN.PATCH_NORM,
use_checkpoint=config.TRAIN.USE_CHECKPOINT)
encoder_stride = 32
in_chans = config.MODEL.SWIN.IN_CHANS
patch_size = config.MODEL.SWIN.PATCH_SIZE
elif model_type == 'swinv2':
encoder = SwinTransformerV2ForSimMIM(
img_size=config.DATA.IMG_SIZE,
patch_size=config.MODEL.SWINV2.PATCH_SIZE,
in_chans=config.MODEL.SWINV2.IN_CHANS,
num_classes=0,
embed_dim=config.MODEL.SWINV2.EMBED_DIM,
depths=config.MODEL.SWINV2.DEPTHS,
num_heads=config.MODEL.SWINV2.NUM_HEADS,
window_size=config.MODEL.SWINV2.WINDOW_SIZE,
mlp_ratio=config.MODEL.SWINV2.MLP_RATIO,
qkv_bias=config.MODEL.SWINV2.QKV_BIAS,
drop_rate=config.MODEL.DROP_RATE,
drop_path_rate=config.MODEL.DROP_PATH_RATE,
ape=config.MODEL.SWINV2.APE,
patch_norm=config.MODEL.SWINV2.PATCH_NORM,
use_checkpoint=config.TRAIN.USE_CHECKPOINT)
encoder_stride = 32
in_chans = config.MODEL.SWINV2.IN_CHANS
patch_size = config.MODEL.SWINV2.PATCH_SIZE
else:
raise NotImplementedError(f"Unknown pre-train model: {model_type}")
model = SimMIM(config=config.MODEL.SIMMIM, encoder=encoder, encoder_stride=encoder_stride, in_chans=in_chans,
patch_size=patch_size)
return model
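# Illustrative sketch (hypothetical hyper-parameters, not part of the original
# file): wiring an encoder into SimMIM directly, bypassing the yacs-style
# config object that build_simmim expects. NORM_TARGET is mocked with a tiny
# namespace so SimMIM.forward can read it.
def _demo_simmim():
    from types import SimpleNamespace
    encoder = SwinTransformerForSimMIM(
        img_size=192, patch_size=4, in_chans=3, num_classes=0,
        embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
        window_size=6)
    cfg = SimpleNamespace(NORM_TARGET=SimpleNamespace(ENABLE=True,
                                                      PATCH_SIZE=47))
    model = SimMIM(config=cfg, encoder=encoder, encoder_stride=32,
                   in_chans=3, patch_size=4)
    x = torch.randn(2, 3, 192, 192)
    mask = torch.randint(0, 2, (2, 48, 48))  # one flag per 4x4 patch
    return model(x, mask)  # scalar reconstruction loss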
================================================
FILE: models/models/backbones/swin_mlp.py
================================================
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
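# Illustrative round-trip check (not part of the original file):
# window_partition followed by window_reverse is the identity mapping.
def _demo_window_roundtrip():
    x = torch.randn(2, 14, 14, 32)          # B, H, W, C with H, W % 7 == 0
    windows = window_partition(x, 7)        # (8, 7, 7, 32)
    y = window_reverse(windows, 7, 14, 14)  # back to (2, 14, 14, 32)
    return torch.equal(x, y)                # True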
class SwinMLPBlock(nn.Module):
r""" Swin MLP Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
mlp_ratio=4., drop=0., drop_path=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
if min(self.input_resolution) <= self.window_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.window_size = min(self.input_resolution)
assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
self.padding = [self.window_size - self.shift_size, self.shift_size,
self.window_size - self.shift_size, self.shift_size] # P_l,P_r,P_t,P_b
self.norm1 = norm_layer(dim)
# use group convolution to implement multi-head MLP
self.spatial_mlp = nn.Conv1d(self.num_heads * self.window_size ** 2,
self.num_heads * self.window_size ** 2,
kernel_size=1,
groups=self.num_heads)
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)
# shift
if self.shift_size > 0:
P_l, P_r, P_t, P_b = self.padding
shifted_x = F.pad(x, [0, 0, P_l, P_r, P_t, P_b], "constant", 0)
else:
shifted_x = x
_, _H, _W, _ = shifted_x.shape
# partition windows
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
# Window/Shifted-Window Spatial MLP
x_windows_heads = x_windows.view(-1, self.window_size * self.window_size, self.num_heads, C // self.num_heads)
x_windows_heads = x_windows_heads.transpose(1, 2) # nW*B, nH, window_size*window_size, C//nH
x_windows_heads = x_windows_heads.reshape(-1, self.num_heads * self.window_size * self.window_size,
C // self.num_heads)
spatial_mlp_windows = self.spatial_mlp(x_windows_heads) # nW*B, nH*window_size*window_size, C//nH
spatial_mlp_windows = spatial_mlp_windows.view(-1, self.num_heads, self.window_size * self.window_size,
C // self.num_heads).transpose(1, 2)
spatial_mlp_windows = spatial_mlp_windows.reshape(-1, self.window_size * self.window_size, C)
# merge windows
spatial_mlp_windows = spatial_mlp_windows.reshape(-1, self.window_size, self.window_size, C)
shifted_x = window_reverse(spatial_mlp_windows, self.window_size, _H, _W) # B H' W' C
# reverse shift
if self.shift_size > 0:
P_l, P_r, P_t, P_b = self.padding
x = shifted_x[:, P_t:-P_b, P_l:-P_r, :].contiguous()
else:
x = shifted_x
x = x.view(B, H * W, C)
# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
def flops(self):
flops = 0
H, W = self.input_resolution
# norm1
flops += self.dim * H * W
# Window/Shifted-Window Spatial MLP
if self.shift_size > 0:
nW = (H / self.window_size + 1) * (W / self.window_size + 1)
else:
nW = H * W / self.window_size / self.window_size
flops += nW * self.dim * (self.window_size * self.window_size) * (self.window_size * self.window_size)
# mlp
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
# norm2
flops += self.dim * H * W
return flops
class PatchMerging(nn.Module):
r""" Patch Merging Layer.
Args:
input_resolution (tuple[int]): Resolution of input feature.
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
self.norm = norm_layer(4 * dim)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
x = x.view(B, H, W, C)
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
def extra_repr(self) -> str:
return f"input_resolution={self.input_resolution}, dim={self.dim}"
def flops(self):
H, W = self.input_resolution
flops = H * W * self.dim
flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
return flops
class BasicLayer(nn.Module):
""" A basic Swin MLP layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
mlp_ratio=4., drop=0., drop_path=0.,
norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.ModuleList([
SwinMLPBlock(dim=dim, input_resolution=input_resolution,
num_heads=num_heads, window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
drop=drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer)
for i in range(depth)])
# patch merging layer
if downsample is not None:
self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x):
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)
if self.downsample is not None:
x = self.downsample(x)
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
def flops(self):
flops = 0
for blk in self.blocks:
flops += blk.flops()
if self.downsample is not None:
flops += self.downsample.flops()
return flops
class PatchEmbed(nn.Module):
r""" Image to Patch Embedding
Args:
img_size (int): Image size. Default: 224.
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
self.img_size = img_size
self.patch_size = patch_size
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
if self.norm is not None:
x = self.norm(x)
return x
def flops(self):
Ho, Wo = self.patches_resolution
flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
if self.norm is not None:
flops += Ho * Wo * self.embed_dim
return flops
class SwinMLP(nn.Module):
r""" Swin MLP
Args:
img_size (int | tuple(int)): Input image size. Default: 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin MLP layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
drop_rate (float): Dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
window_size=7, mlp_ratio=4., drop_rate=0., drop_path_rate=0.1,
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
use_checkpoint=False, **kwargs):
super().__init__()
self.num_classes = num_classes
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
self.mlp_ratio = mlp_ratio
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
trunc_normal_(self.absolute_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
input_resolution=(patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
drop=drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint)
self.layers.append(layer)
self.norm = norm_layer(self.num_features)
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, (nn.Linear, nn.Conv1d)):
trunc_normal_(m.weight, std=.02)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'absolute_pos_embed'}
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {'relative_position_bias_table'}
def forward_features(self, x):
x = self.patch_embed(x)
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
for layer in self.layers:
x = layer(x)
x = self.norm(x) # B L C
x = self.avgpool(x.transpose(1, 2)) # B C 1
x = torch.flatten(x, 1)
return x
def forward(self, x):
x = self.forward_features(x)
x = self.head(x)
return x
def flops(self):
flops = 0
flops += self.patch_embed.flops()
for i, layer in enumerate(self.layers):
flops += layer.flops()
flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
flops += self.num_features * self.num_classes
return flops
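# Illustrative usage (hypothetical sizes, not part of the original file):
# a default SwinMLP-T-like configuration run on a dummy batch.
if __name__ == '__main__':
    model = SwinMLP(img_size=224, patch_size=4, num_classes=10)
    logits = model(torch.randn(2, 3, 224, 224))
    print(logits.shape)  # torch.Size([2, 10])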
================================================
FILE: models/models/backbones/swin_transformer.py
================================================
# --------------------------------------------------------
# Swin Transformer
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
try:
import os, sys
kernel_path = os.path.abspath(os.path.join('..'))
sys.path.append(kernel_path)
from kernels.window_process.window_process import WindowProcess, WindowProcessReverse
except ImportError:
WindowProcess = None
WindowProcessReverse = None
print("[Warning] Fused window process have not been installed. Please refer to get_started.md for installation.")
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
class WindowAttention(nn.Module):
r""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both shifted and non-shifted windows.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""
def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim ** -0.5
# define a parameter table of relative position bias
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
trunc_normal_(self.relative_position_bias_table, std=.02)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x, mask=None):
"""
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def extra_repr(self) -> str:
return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
def flops(self, N):
# calculate flops for 1 window with token length of N
flops = 0
# qkv = self.qkv(x)
flops += N * self.dim * 3 * self.dim
# attn = (q @ k.transpose(-2, -1))
flops += self.num_heads * N * (self.dim // self.num_heads) * N
# x = (attn @ v)
flops += self.num_heads * N * N * (self.dim // self.num_heads)
# x = self.proj(x)
flops += N * self.dim * self.dim
return flops
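# Illustrative usage (not part of the original file): WindowAttention operates
# on already-partitioned windows of N = Wh*Ww tokens; its batch dimension is
# num_windows * B.
def _demo_window_attention():
    attn = WindowAttention(dim=96, window_size=to_2tuple(7), num_heads=3)
    x = torch.randn(8, 49, 96)  # (num_windows*B, N, C)
    return attn(x).shape        # torch.Size([8, 49, 96])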
class SwinTransformerBlock(nn.Module):
r""" Swin Transformer Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
fused_window_process (bool, optional): If True, use a single fused kernel for window shift & window partition to accelerate, and similarly for the reverse step. Default: False
"""
def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm,
fused_window_process=False):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
if min(self.input_resolution) <= self.window_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.window_size = min(self.input_resolution)
assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
if self.shift_size > 0:
# calculate attention mask for SW-MSA
H, W = self.input_resolution
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
attn_mask = None
self.register_buffer("attn_mask", attn_mask)
self.fused_window_process = fused_window_process
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)
# cyclic shift
if self.shift_size > 0:
if not self.fused_window_process:
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
# partition windows
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
else:
x_windows = WindowProcess.apply(x, B, H, W, C, -self.shift_size, self.window_size)
else:
shifted_x = x
# partition windows
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
# reverse cyclic shift
if self.shift_size > 0:
if not self.fused_window_process:
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = WindowProcessReverse.apply(attn_windows, B, H, W, C, self.shift_size, self.window_size)
else:
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
x = shifted_x
x = x.view(B, H * W, C)
x = shortcut + self.drop_path(x)
# FFN
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
def flops(self):
flops = 0
H, W = self.input_resolution
# norm1
flops += self.dim * H * W
# W-MSA/SW-MSA
nW = H * W / self.window_size / self.window_size
flops += nW * self.attn.flops(self.window_size * self.window_size)
# mlp
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
# norm2
flops += self.dim * H * W
return flops
class PatchMerging(nn.Module):
r""" Patch Merging Layer.
Args:
input_resolution (tuple[int]): Resolution of input feature.
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
self.norm = norm_layer(4 * dim)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
x = x.view(B, H, W, C)
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
def extra_repr(self) -> str:
return f"input_resolution={self.input_resolution}, dim={self.dim}"
def flops(self):
H, W = self.input_resolution
flops = H * W * self.dim
flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
return flops
class BasicLayer(nn.Module):
""" A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
fused_window_process (bool, optional): If True, use a single fused kernel for window shift & window partition to accelerate, and similarly for the reverse step. Default: False
"""
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
fused_window_process=False):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.ModuleList([
SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
num_heads=num_heads, window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop, attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer,
fused_window_process=fused_window_process)
for i in range(depth)])
# patch merging layer
if downsample is not None:
self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x):
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)
if self.downsample is not None:
x = self.downsample(x)
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
def flops(self):
flops = 0
for blk in self.blocks:
flops += blk.flops()
if self.downsample is not None:
flops += self.downsample.flops()
return flops
class PatchEmbed(nn.Module):
r""" Image to Patch Embedding
Args:
img_size (int): Image size. Default: 224.
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
self.img_size = img_size
self.patch_size = patch_size
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
if self.norm is not None:
x = self.norm(x)
return x
def flops(self):
Ho, Wo = self.patches_resolution
flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
if self.norm is not None:
flops += Ho * Wo * self.embed_dim
return flops
class SwinTransformer(nn.Module):
r""" Swin Transformer
A PyTorch impl of: `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/pdf/2103.14030
Args:
img_size (int | tuple(int)): Input image size. Default: 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
fused_window_process (bool, optional): If True, use a single fused kernel for window shift & window partition to accelerate, and similarly for the reverse step. Default: False
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
use_checkpoint=False, fused_window_process=False, **kwargs):
super().__init__()
self.num_classes = num_classes
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
self.mlp_ratio = mlp_ratio
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
trunc_normal_(self.absolute_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
input_resolution=(patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint,
fused_window_process=fused_window_process)
self.layers.append(layer)
self.norm = norm_layer(self.num_features)
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'absolute_pos_embed'}
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {'relative_position_bias_table'}
def forward_features(self, x):
x = self.patch_embed(x)
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
for layer in self.layers:
x = layer(x)
x = self.norm(x) # B L C
x = self.avgpool(x.transpose(1, 2)) # B C 1
x = torch.flatten(x, 1)
return x
def forward(self, x):
x = self.forward_features(x)
x = self.head(x)
return x
def flops(self):
flops = 0
flops += self.patch_embed.flops()
for i, layer in enumerate(self.layers):
flops += layer.flops()
flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
flops += self.num_features * self.num_classes
return flops
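# Illustrative sketch (not part of the original file): the no_weight_decay /
# no_weight_decay_keywords hooks are intended for building optimizer param
# groups that skip weight decay on position embeddings and relative-position
# bias tables, as in common Swin training recipes.
def _demo_param_groups(model):
    skip = model.no_weight_decay()
    skip_keywords = model.no_weight_decay_keywords()
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if (len(param.shape) == 1 or name.endswith('.bias')
                or name in skip
                or any(kw in name for kw in skip_keywords)):
            no_decay.append(param)
        else:
            decay.append(param)
    return [{'params': no_decay, 'weight_decay': 0.},
            {'params': decay}]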
================================================
FILE: models/models/backbones/swin_transformer_moe.py
================================================
# --------------------------------------------------------
# Swin Transformer MoE
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
try:
from tutel import moe as tutel_moe
except ImportError:
tutel_moe = None
print("Tutel has not been installed. To use Swin-MoE, please install Tutel; otherwise, just ignore this.")
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.,
mlp_fc2_bias=True):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features, bias=mlp_fc2_bias)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class MoEMlp(nn.Module):
def __init__(self, in_features, hidden_features, num_local_experts, top_value, capacity_factor=1.25,
cosine_router=False, normalize_gate=False, use_bpr=True, is_gshard_loss=True,
gate_noise=1.0, cosine_router_dim=256, cosine_router_init_t=0.5, moe_drop=0.0, init_std=0.02,
mlp_fc2_bias=True):
super().__init__()
self.in_features = in_features
self.hidden_features = hidden_features
self.num_local_experts = num_local_experts
self.top_value = top_value
self.capacity_factor = capacity_factor
self.cosine_router = cosine_router
self.normalize_gate = normalize_gate
self.use_bpr = use_bpr
self.init_std = init_std
self.mlp_fc2_bias = mlp_fc2_bias
self.dist_rank = dist.get_rank()
self._dropout = nn.Dropout(p=moe_drop)
_gate_type = {'type': 'cosine_top' if cosine_router else 'top',
'k': top_value, 'capacity_factor': capacity_factor,
'gate_noise': gate_noise, 'fp32_gate': True}
if cosine_router:
_gate_type['proj_dim'] = cosine_router_dim
_gate_type['init_t'] = cosine_router_init_t
self._moe_layer = tutel_moe.moe_layer(
gate_type=_gate_type,
model_dim=in_features,
experts={'type': 'ffn', 'count_per_node': num_local_experts, 'hidden_size_per_expert': hidden_features,
'activation_fn': lambda x: self._dropout(F.gelu(x))},
scan_expert_func=lambda name, param: setattr(param, 'skip_allreduce', True),
seeds=(1, self.dist_rank + 1, self.dist_rank + 1),
batch_prioritized_routing=use_bpr,
normalize_gate=normalize_gate,
is_gshard_loss=is_gshard_loss,
)
if not self.mlp_fc2_bias:
self._moe_layer.experts.batched_fc2_bias.requires_grad = False
def forward(self, x):
x = self._moe_layer(x)
return x, x.l_aux
def extra_repr(self) -> str:
return f'[Statistics-{self.dist_rank}] param count for MoE, ' \
f'in_features = {self.in_features}, hidden_features = {self.hidden_features}, ' \
f'num_local_experts = {self.num_local_experts}, top_value = {self.top_value}, ' \
f'cosine_router={self.cosine_router}, normalize_gate={self.normalize_gate}, use_bpr={self.use_bpr}'
def _init_weights(self):
if hasattr(self._moe_layer, "experts"):
trunc_normal_(self._moe_layer.experts.batched_fc1_w, std=self.init_std)
trunc_normal_(self._moe_layer.experts.batched_fc2_w, std=self.init_std)
nn.init.constant_(self._moe_layer.experts.batched_fc1_bias, 0)
nn.init.constant_(self._moe_layer.experts.batched_fc2_bias, 0)
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
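# Editor's sketch (not part of the original file): window_partition and
# window_reverse are exact inverses whenever H and W are multiples of the
# window size, which is what the Swin blocks below rely on.
def _demo_window_roundtrip():
    B, H, W, C, ws = 2, 8, 8, 3, 4
    x = torch.randn(B, H, W, C)
    windows = window_partition(x, ws)
    assert windows.shape == (B * (H // ws) * (W // ws), ws, ws, C)  # (8, 4, 4, 3)
    assert torch.equal(window_reverse(windows, ws, H, W), x)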
class WindowAttention(nn.Module):
r""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both shifted and non-shifted windows.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
"""
def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.,
pretrained_window_size=[0, 0]):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.pretrained_window_size = pretrained_window_size
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim ** -0.5
# mlp to generate continuous relative position bias
self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True),
nn.ReLU(inplace=True),
nn.Linear(512, num_heads, bias=False))
# get relative_coords_table
relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
relative_coords_table = torch.stack(
torch.meshgrid([relative_coords_h,
relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2
if pretrained_window_size[0] > 0:
relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1)
relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1)
else:
relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
relative_coords_table *= 8 # normalize to -8, 8
relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
torch.abs(relative_coords_table) + 1.0) / np.log2(8)
self.register_buffer("relative_coords_table", relative_coords_table)
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x, mask=None):
"""
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def extra_repr(self) -> str:
return f'dim={self.dim}, window_size={self.window_size}, ' \
f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}'
def flops(self, N):
# calculate flops for 1 window with token length of N
flops = 0
# qkv = self.qkv(x)
flops += N * self.dim * 3 * self.dim
# attn = (q @ k.transpose(-2, -1))
flops += self.num_heads * N * (self.dim // self.num_heads) * N
# x = (attn @ v)
flops += self.num_heads * N * N * (self.dim // self.num_heads)
# x = self.proj(x)
flops += N * self.dim * self.dim
return flops
class SwinTransformerBlock(nn.Module):
r""" Swin Transformer Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
mlp_fc2_bias (bool): Whether to add bias in fc2 of Mlp. Default: True
init_std: Initialization std. Default: 0.02
pretrained_window_size (int): Window size in pre-training.
is_moe (bool): If True, this block is a MoE block.
num_local_experts (int): number of local experts in each device (GPU). Default: 1
top_value (int): the value of k in top-k gating. Default: 1
capacity_factor (float): the capacity factor in MoE. Default: 1.25
cosine_router (bool): Whether to use cosine router. Default: False
normalize_gate (bool): Whether to normalize the gating score in top-k gating. Default: False
use_bpr (bool): Whether to use batch-prioritized-routing. Default: True
is_gshard_loss (bool): If True, use Gshard balance loss.
If False, use the load loss and importance loss in "arXiv:1701.06538". Default: True
gate_noise (float): the noise ratio in top-k gating. Default: 1.0
cosine_router_dim (int): Projection dimension in cosine router.
cosine_router_init_t (float): Initialization temperature in cosine router.
moe_drop (float): Dropout rate in MoE. Default: 0.0
"""
def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm, mlp_fc2_bias=True, init_std=0.02, pretrained_window_size=0,
is_moe=False, num_local_experts=1, top_value=1, capacity_factor=1.25, cosine_router=False,
normalize_gate=False, use_bpr=True, is_gshard_loss=True, gate_noise=1.0,
cosine_router_dim=256, cosine_router_init_t=0.5, moe_drop=0.0):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
self.is_moe = is_moe
self.capacity_factor = capacity_factor
self.top_value = top_value
if min(self.input_resolution) <= self.window_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.window_size = min(self.input_resolution)
assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop,
pretrained_window_size=to_2tuple(pretrained_window_size))
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
if self.is_moe:
self.mlp = MoEMlp(in_features=dim,
hidden_features=mlp_hidden_dim,
num_local_experts=num_local_experts,
top_value=top_value,
capacity_factor=capacity_factor,
cosine_router=cosine_router,
normalize_gate=normalize_gate,
use_bpr=use_bpr,
is_gshard_loss=is_gshard_loss,
gate_noise=gate_noise,
cosine_router_dim=cosine_router_dim,
cosine_router_init_t=cosine_router_init_t,
moe_drop=moe_drop,
mlp_fc2_bias=mlp_fc2_bias,
init_std=init_std)
else:
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop,
mlp_fc2_bias=mlp_fc2_bias)
if self.shift_size > 0:
# calculate attention mask for SW-MSA
H, W = self.input_resolution
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
attn_mask = None
self.register_buffer("attn_mask", attn_mask)
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.view(B, H, W, C)
# cyclic shift
if self.shift_size > 0:
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_x = x
# partition windows
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = shifted_x
x = x.view(B, H * W, C)
x = shortcut + self.drop_path(x)
# FFN
shortcut = x
x = self.norm2(x)
if self.is_moe:
x, l_aux = self.mlp(x)
x = shortcut + self.drop_path(x)
return x, l_aux
else:
x = shortcut + self.drop_path(self.mlp(x))
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
def flops(self):
flops = 0
H, W = self.input_resolution
# norm1
flops += self.dim * H * W
# W-MSA/SW-MSA
nW = H * W / self.window_size / self.window_size
flops += nW * self.attn.flops(self.window_size * self.window_size)
# mlp
if self.is_moe:
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio * self.capacity_factor * self.top_value
else:
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
# norm2
flops += self.dim * H * W
return flops
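# Editor's sketch (not part of the original file): the SW-MSA attention mask
# built in __init__ above, reproduced standalone for an 8x8 resolution with
# window_size=4 and shift_size=2. Tokens that come from different image regions
# but share a shifted window receive -100.0, which softmax turns into ~0 weight.
def _demo_sw_msa_mask():
    H = W = 8
    window_size, shift_size = 4, 2
    img_mask = torch.zeros((1, H, W, 1))
    region_slices = (slice(0, -window_size),
                     slice(-window_size, -shift_size),
                     slice(-shift_size, None))
    cnt = 0
    for h in region_slices:
        for w in region_slices:
            img_mask[:, h, w, :] = cnt
            cnt += 1
    mask_windows = window_partition(img_mask, window_size).view(-1, window_size * window_size)
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
    assert attn_mask.shape == (4, 16, 16)  # 4 windows, 16 tokens each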
class PatchMerging(nn.Module):
r""" Patch Merging Layer.
Args:
input_resolution (tuple[int]): Resolution of input feature.
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
self.norm = norm_layer(4 * dim)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
x = x.view(B, H, W, C)
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
def extra_repr(self) -> str:
return f"input_resolution={self.input_resolution}, dim={self.dim}"
def flops(self):
H, W = self.input_resolution
flops = H * W * self.dim
flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
return flops
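# Editor's sketch (not part of the original file): PatchMerging trades spatial
# resolution for channels. Four neighbouring tokens are concatenated (4C),
# normalized, then linearly reduced to 2C, so the token count drops 4x.
def _demo_patch_merging():
    B, H, W, C = 2, 8, 8, 96
    merge = PatchMerging(input_resolution=(H, W), dim=C)
    x = torch.randn(B, H * W, C)
    assert merge(x).shape == (B, (H // 2) * (W // 2), 2 * C)  # (2, 16, 192)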
class BasicLayer(nn.Module):
""" A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
mlp_fc2_bias (bool): Whether to add bias in fc2 of Mlp. Default: True
init_std: Initialization std. Default: 0.02
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
pretrained_window_size (int): Local window size in pre-training.
moe_block (tuple(int)): Indices of the MoE blocks in this layer. Default: [-1] (no MoE blocks).
num_local_experts (int): number of local experts in each device (GPU). Default: 1
top_value (int): the value of k in top-k gating. Default: 1
capacity_factor (float): the capacity factor in MoE. Default: 1.25
cosine_router (bool): Whether to use cosine router. Default: False
normalize_gate (bool): Whether to normalize the gating score in top-k gating. Default: False
use_bpr (bool): Whether to use batch-prioritized-routing. Default: True
is_gshard_loss (bool): If True, use Gshard balance loss.
If False, use the load loss and importance loss in "arXiv:1701.06538". Default: True
gate_noise (float): the noise ratio in top-k gating. Default: 1.0
cosine_router_dim (int): Projection dimension in cosine router.
cosine_router_init_t (float): Initialization temperature in cosine router.
moe_drop (float): Dropout rate in MoE. Default: 0.0
"""
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, downsample=None,
mlp_fc2_bias=True, init_std=0.02, use_checkpoint=False, pretrained_window_size=0,
moe_block=[-1], num_local_experts=1, top_value=1, capacity_factor=1.25, cosine_router=False,
normalize_gate=False, use_bpr=True, is_gshard_loss=True,
cosine_router_dim=256, cosine_router_init_t=0.5, gate_noise=1.0, moe_drop=0.0):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.ModuleList([
SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
num_heads=num_heads, window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop, attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer,
mlp_fc2_bias=mlp_fc2_bias,
init_std=init_std,
pretrained_window_size=pretrained_window_size,
is_moe=True if i in moe_block else False,
num_local_experts=num_local_experts,
top_value=top_value,
capacity_factor=capacity_factor,
cosine_router=cosine_router,
normalize_gate=normalize_gate,
use_bpr=use_bpr,
is_gshard_loss=is_gshard_loss,
gate_noise=gate_noise,
cosine_router_dim=cosine_router_dim,
cosine_router_init_t=cosine_router_init_t,
moe_drop=moe_drop)
for i in range(depth)])
# patch merging layer
if downsample is not None:
self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x):
l_aux = 0.0
for blk in self.blocks:
if self.use_checkpoint:
out = checkpoint.checkpoint(blk, x)
else:
out = blk(x)
if isinstance(out, tuple):
x = out[0]
cur_l_aux = out[1]
l_aux = cur_l_aux + l_aux
else:
x = out
if self.downsample is not None:
x = self.downsample(x)
return x, l_aux
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
def flops(self):
flops = 0
for blk in self.blocks:
flops += blk.flops()
if self.downsample is not None:
flops += self.downsample.flops()
return flops
class PatchEmbed(nn.Module):
r""" Image to Patch Embedding
Args:
img_size (int): Image size. Default: 224.
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
self.img_size = img_size
self.patch_size = patch_size
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
if self.norm is not None:
x = self.norm(x)
return x
def flops(self):
Ho, Wo = self.patches_resolution
flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
if self.norm is not None:
flops += Ho * Wo * self.embed_dim
return flops
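# Editor's sketch (not part of the original file): PatchEmbed is just a strided
# convolution that turns an image into a sequence of patch tokens.
def _demo_patch_embed():
    embed = PatchEmbed(img_size=224, patch_size=4, in_chans=3, embed_dim=96)
    tokens = embed(torch.randn(2, 3, 224, 224))
    assert tokens.shape == (2, 56 * 56, 96)  # (224 / 4) ** 2 = 3136 tokens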
class SwinTransformerMoE(nn.Module):
r""" Swin Transformer
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/pdf/2103.14030
Args:
img_size (int | tuple(int)): Input image size. Default 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
mlp_fc2_bias (bool): Whether to add bias in fc2 of Mlp. Default: True
init_std: Initialization std. Default: 0.02
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer.
moe_blocks (tuple(tuple(int))): The index of each MoE block in each layer.
num_local_experts (int): number of local experts in each device (GPU). Default: 1
top_value (int): the value of k in top-k gating. Default: 1
capacity_factor (float): the capacity factor in MoE. Default: 1.25
cosine_router (bool): Whether to use cosine router. Default: False
normalize_gate (bool): Whether to normalize the gating score in top-k gating. Default: False
use_bpr (bool): Whether to use batch-prioritized-routing. Default: True
is_gshard_loss (bool): If True, use Gshard balance loss.
If False, use the load loss and importance loss in "arXiv:1701.06538". Default: True
gate_noise (float): the noise ratio in top-k gating. Default: 1.0
cosine_router_dim (int): Projection dimension in cosine router.
cosine_router_init_t (float): Initialization temperature in cosine router.
moe_drop (float): Dropout rate in MoE. Default: 0.0
aux_loss_weight (float): auxiliary loss weight. Default: 0.01
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
mlp_fc2_bias=True, init_std=0.02, use_checkpoint=False, pretrained_window_sizes=[0, 0, 0, 0],
moe_blocks=[[-1], [-1], [-1], [-1]], num_local_experts=1, top_value=1, capacity_factor=1.25,
cosine_router=False, normalize_gate=False, use_bpr=True, is_gshard_loss=True, gate_noise=1.0,
cosine_router_dim=256, cosine_router_init_t=0.5, moe_drop=0.0, aux_loss_weight=0.01, **kwargs):
super().__init__()
self._ddp_params_and_buffers_to_ignore = list()
self.num_classes = num_classes
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
self.mlp_ratio = mlp_ratio
self.init_std = init_std
self.aux_loss_weight = aux_loss_weight
self.num_local_experts = num_local_experts
self.global_experts = num_local_experts * dist.get_world_size() if num_local_experts > 0 \
else dist.get_world_size() // (-num_local_experts)
self.sharded_count = (1.0 / num_local_experts) if num_local_experts > 0 else (-num_local_experts)
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
trunc_normal_(self.absolute_pos_embed, std=self.init_std)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
input_resolution=(patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
mlp_fc2_bias=mlp_fc2_bias,
init_std=init_std,
use_checkpoint=use_checkpoint,
pretrained_window_size=pretrained_window_sizes[i_layer],
moe_block=moe_blocks[i_layer],
num_local_experts=num_local_experts,
top_value=top_value,
capacity_factor=capacity_factor,
cosine_router=cosine_router,
normalize_gate=normalize_gate,
use_bpr=use_bpr,
is_gshard_loss=is_gshard_loss,
gate_noise=gate_noise,
cosine_router_dim=cosine_router_dim,
cosine_router_init_t=cosine_router_init_t,
moe_drop=moe_drop)
self.layers.append(layer)
self.norm = norm_layer(self.num_features)
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=self.init_std)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, MoEMlp):
m._init_weights()
@torch.jit.ignore
def no_weight_decay(self):
return {'absolute_pos_embed'}
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {"cpb_mlp", 'relative_position_bias_table', 'fc1_bias', 'fc2_bias',
'temperature', 'cosine_projector', 'sim_matrix'}
def forward_features(self, x):
x = self.patch_embed(x)
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
l_aux = 0.0
for layer in self.layers:
x, cur_l_aux = layer(x)
l_aux = cur_l_aux + l_aux
x = self.norm(x) # B L C
x = self.avgpool(x.transpose(1, 2)) # B C 1
x = torch.flatten(x, 1)
return x, l_aux
def forward(self, x):
x, l_aux = self.forward_features(x)
x = self.head(x)
return x, l_aux * self.aux_loss_weight
def add_param_to_skip_allreduce(self, param_name):
self._ddp_params_and_buffers_to_ignore.append(param_name)
def flops(self):
flops = 0
flops += self.patch_embed.flops()
for i, layer in enumerate(self.layers):
flops += layer.flops()
flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
flops += self.num_features * self.num_classes
return flops
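# Editor's sketch (not part of the original file): unlike the dense models in
# this repo, SwinTransformerMoE.forward returns a tuple. Actually running it
# requires Tutel and an initialized torch.distributed process group (MoEMlp
# calls dist.get_rank() / dist.get_world_size()), so only the calling
# convention is shown:
#
#     model = SwinTransformerMoE(num_classes=1000, moe_blocks=[[-1], [-1], [1], [-1]])
#     logits, weighted_l_aux = model(torch.randn(2, 3, 224, 224))
#     loss = cross_entropy(logits, labels) + weighted_l_aux  # aux term is pre-scaled by aux_loss_weight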
================================================
FILE: models/models/backbones/swin_transformer_v2.py
================================================
# --------------------------------------------------------
# Swin Transformer V2
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# --------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from mmpose.models.builder import BACKBONES
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
class WindowAttention(nn.Module):
r""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both shifted and non-shifted windows.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
"""
def __init__(self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0.,
pretrained_window_size=[0, 0]):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.pretrained_window_size = pretrained_window_size
self.num_heads = num_heads
self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True)
# mlp to generate continuous relative position bias
self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True),
nn.ReLU(inplace=True),
nn.Linear(512, num_heads, bias=False))
# get relative_coords_table
relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
relative_coords_table = torch.stack(
torch.meshgrid([relative_coords_h,
relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2
if pretrained_window_size[0] > 0:
relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1)
relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1)
else:
relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
relative_coords_table *= 8 # normalize to -8, 8
relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
torch.abs(relative_coords_table) + 1.0) / np.log2(8)
self.register_buffer("relative_coords_table", relative_coords_table)
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.qkv = nn.Linear(dim, dim * 3, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(dim))
self.v_bias = nn.Parameter(torch.zeros(dim))
else:
self.q_bias = None
self.v_bias = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.softmax = nn.Softmax(dim=-1)
def forward(self, x, mask=None):
"""
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv_bias = None
if self.q_bias is not None:
qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
# cosine attention
attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1))
logit_scale = torch.clamp(self.logit_scale, max=torch.log(torch.tensor(1. / 0.01, device=x.device))).exp()
attn = attn * logit_scale
relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
def extra_repr(self) -> str:
return f'dim={self.dim}, window_size={self.window_size}, ' \
f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}'
def flops(self, N):
# calculate flops for 1 window with token length of N
flops = 0
# qkv = self.qkv(x)
flops += N * self.dim * 3 * self.dim
# attn = (q @ k.transpose(-2, -1))
flops += self.num_heads * N * (self.dim // self.num_heads) * N
# x = (attn @ v)
flops += self.num_heads * N * N * (self.dim // self.num_heads)
# x = self.proj(x)
flops += N * self.dim * self.dim
return flops
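# Editor's sketch (not part of the original file): V2 replaces dot-product
# attention with cosine similarity times a learned, clamped temperature, so the
# pre-softmax logits stay bounded no matter how large the features grow.
def _demo_cosine_attention():
    q = torch.randn(1, 4, 16, 32) * 100.0  # deliberately huge magnitudes
    k = torch.randn(1, 4, 16, 32) * 100.0
    attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
    assert attn.abs().max() <= 1.0 + 1e-5  # cosine similarity is bounded
    logit_scale = torch.clamp(torch.log(10 * torch.ones(4, 1, 1)),
                              max=torch.log(torch.tensor(100.0))).exp()
    assert float(logit_scale.max()) <= 100.0  # temperature capped at 1 / 0.01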
class SwinTransformerBlock(nn.Module):
r""" Swin Transformer Block.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
pretrained_window_size (int): Window size in pre-training.
"""
def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0.,
act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
if min(self.input_resolution) <= self.window_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.window_size = min(self.input_resolution)
assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop,
pretrained_window_size=to_2tuple(pretrained_window_size))
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
if self.shift_size > 0:
# calculate attention mask for SW-MSA
H, W = self.input_resolution
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
attn_mask = None
self.register_buffer("attn_mask", attn_mask)
def forward(self, x):
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
shortcut = x
x = x.view(B, H, W, C)
# cyclic shift
if self.shift_size > 0:
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_x = x
# partition windows
x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = shifted_x
x = x.view(B, H * W, C)
x = shortcut + self.drop_path(self.norm1(x))
# FFN
x = x + self.drop_path(self.norm2(self.mlp(x)))
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
def flops(self):
flops = 0
H, W = self.input_resolution
# norm1
flops += self.dim * H * W
# W-MSA/SW-MSA
nW = H * W / self.window_size / self.window_size
flops += nW * self.attn.flops(self.window_size * self.window_size)
# mlp
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
# norm2
flops += self.dim * H * W
return flops
class PatchMerging(nn.Module):
r""" Patch Merging Layer.
Args:
input_resolution (tuple[int]): Resolution of input feature.
dim (int): Number of input channels.
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.input_resolution = input_resolution
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
self.norm = norm_layer(2 * dim)
def forward(self, x):
"""
x: B, H*W, C
"""
H, W = self.input_resolution
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
x = x.view(B, H, W, C)
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
x = self.reduction(x)
x = self.norm(x)
return x
def extra_repr(self) -> str:
return f"input_resolution={self.input_resolution}, dim={self.dim}"
def flops(self):
H, W = self.input_resolution
flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
flops += H * W * self.dim // 2
return flops
class BasicLayer(nn.Module):
""" A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
input_resolution (tuple[int]): Input resolution.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
pretrained_window_size (int): Local window size in pre-training.
"""
def __init__(self, dim, input_resolution, depth, num_heads, window_size,
mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0.,
drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False,
pretrained_window_size=0):
super().__init__()
self.dim = dim
self.input_resolution = input_resolution
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.ModuleList([
SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
num_heads=num_heads, window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
drop=drop, attn_drop=attn_drop,
drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
norm_layer=norm_layer,
pretrained_window_size=pretrained_window_size)
for i in range(depth)])
# patch merging layer
if downsample is not None:
self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x):
for blk in self.blocks:
if self.use_checkpoint:
x = checkpoint.checkpoint(blk, x)
else:
x = blk(x)
if self.downsample is not None:
x = self.downsample(x)
return x
def extra_repr(self) -> str:
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
def flops(self):
flops = 0
for blk in self.blocks:
flops += blk.flops()
if self.downsample is not None:
flops += self.downsample.flops()
return flops
def _init_respostnorm(self):
for blk in self.blocks:
nn.init.constant_(blk.norm1.bias, 0)
nn.init.constant_(blk.norm1.weight, 0)
nn.init.constant_(blk.norm2.bias, 0)
nn.init.constant_(blk.norm2.weight, 0)
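# Editor's sketch (not part of the original file): with res-post-norm, zeroing
# the LayerNorm weights makes norm1/norm2 output zeros, so each block starts as
# the identity mapping (x = x + drop_path(0)) and training begins from a clean
# residual stream.
def _demo_respostnorm_identity():
    norm = nn.LayerNorm(8)
    nn.init.constant_(norm.weight, 0)
    nn.init.constant_(norm.bias, 0)
    assert torch.equal(norm(torch.randn(2, 8)), torch.zeros(2, 8))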
class PatchEmbed(nn.Module):
r""" Image to Patch Embedding
Args:
img_size (int): Image size. Default: 224.
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Module, optional): Normalization layer. Default: None
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
self.img_size = img_size
self.patch_size = patch_size
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
B, C, H, W = x.shape
# FIXME look at relaxing size constraints
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
if self.norm is not None:
x = self.norm(x)
return x
def flops(self):
Ho, Wo = self.patches_resolution
flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
if self.norm is not None:
flops += Ho * Wo * self.embed_dim
return flops
@BACKBONES.register_module()
class SwinTransformerV2(nn.Module):
r""" Swin Transformer
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/pdf/2103.14030
Args:
img_size (int | tuple(int)): Input image size. Default 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
num_classes (int): Number of classes for classification head. Default: 1000
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer.
multi_scale (bool): If True, upsample the feature maps of all stages and fuse them with a 1x1 conv. Default: False
upsample (str): Upsampling applied to the final feature map: 'deconv', 'new_deconv', 'new_deconv2', 'bilinear', or anything else for identity. Default: 'deconv'
"""
def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
window_size=7, mlp_ratio=4., qkv_bias=True,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
use_checkpoint=False, pretrained_window_sizes=[0, 0, 0, 0],
multi_scale=False, upsample='deconv', **kwargs):
super().__init__()
self.num_classes = num_classes
self.num_layers = len(depths)
self.embed_dim = embed_dim
self.ape = ape
self.patch_norm = patch_norm
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
self.mlp_ratio = mlp_ratio
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
num_patches = self.patch_embed.num_patches
patches_resolution = self.patch_embed.patches_resolution
self.patches_resolution = patches_resolution
# absolute position embedding
if self.ape:
self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
trunc_normal_(self.absolute_pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
# build layers
self.layers = nn.ModuleList()
for i_layer in range(self.num_layers):
layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
input_resolution=(patches_resolution[0] // (2 ** i_layer),
patches_resolution[1] // (2 ** i_layer)),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=self.mlp_ratio,
qkv_bias=qkv_bias,
drop=drop_rate, attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
use_checkpoint=use_checkpoint,
pretrained_window_size=pretrained_window_sizes[i_layer])
self.layers.append(layer)
self.norm = norm_layer(self.num_features)
self.avgpool = nn.AdaptiveAvgPool1d(1)
self.multi_scale = multi_scale
if self.multi_scale:
self.scales = [1, 2, 4, 4]
self.upsample = nn.ModuleList()
features = [int(embed_dim * 2 ** i) for i in range(1, self.num_layers)] + [self.num_features]
self.multi_scale_fuse = nn.Conv2d(sum(features), self.num_features, 1)
for i in range(self.num_layers):
self.upsample.append(nn.Upsample(scale_factor=self.scales[i]))
else:
if upsample == 'deconv':
self.upsample = nn.ConvTranspose2d(self.num_features, self.num_features, 2, stride=2)
elif upsample == 'new_deconv':
self.upsample = nn.Sequential(nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
nn.Conv2d(self.num_features, self.num_features, 3, stride=1, padding=1),
nn.BatchNorm2d(self.num_features),
nn.ReLU(inplace=True)
)
elif upsample == 'new_deconv2':
self.upsample = nn.Sequential(nn.Upsample(scale_factor=2),
nn.Conv2d(self.num_features, self.num_features, 3, stride=1, padding=1),
nn.BatchNorm2d(self.num_features),
nn.ReLU(inplace=True)
)
elif upsample == 'bilinear':
self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
else:
self.upsample = nn.Identity()
self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
self.apply(self._init_weights)
for bly in self.layers:
bly._init_respostnorm()
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'absolute_pos_embed'}
@torch.jit.ignore
def no_weight_decay_keywords(self):
return {"cpb_mlp", "logit_scale", 'relative_position_bias_table'}
def forward_features(self, x):
B, C, H, W = x.shape
x = self.patch_embed(x)
if self.ape:
x = x + self.absolute_pos_embed
x = self.pos_drop(x)
if self.multi_scale:
# x_2d = x.view(B, H // 4, W // 4, -1).permute(0, 3, 1, 2) # B C H W
# features = [self.upsample[0](x_2d)]
features = []
for i, layer in enumerate(self.layers):
x = layer(x)
x_2d = x.view(B, H // (8 * self.scales[i]), W // (8 * self.scales[i]), -1).permute(0, 3, 1, 2)  # B C H W
features.append(self.upsample[i](x_2d))
x = torch.cat(features, dim=1)
x = self.multi_scale_fuse(x)
x = x.view(B, self.num_features, -1).permute(0, 2, 1)
x = self.norm(x) # B L C
x = x.view(B, H // 8, W // 8, self.num_features).permute(0, 3, 1, 2) # B C H W
else:
for layer in self.layers:
x = layer(x)
x = self.norm(x) # B L C
x = x.view(B, H // 32, W // 32, self.num_features).permute(0, 3, 1, 2) # B C H W
x = self.upsample(x)
return x
def forward(self, x):
x = self.forward_features(x)
x = self.head(x)
return x
def flops(self):
flops = 0
flops += self.patch_embed.flops()
for i, layer in enumerate(self.layers):
flops += layer.flops()
flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
flops += self.num_features * self.num_classes
return flops
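# Editor's sketch (not part of the original file): in this repo the class is
# registered as an mmpose BACKBONE, so forward() is expected to yield a feature
# map rather than logits. With num_classes=0 (head = nn.Identity) and
# window_size=8 so that every stage resolution is divisible by the window, a
# 256x256 crop gives a stride-16 map after the default 'deconv' x2 upsample.
def _demo_backbone_feature_map():
    backbone = SwinTransformerV2(img_size=256, window_size=8, num_classes=0)
    feats = backbone(torch.randn(1, 3, 256, 256))
    assert feats.shape == (1, 768, 16, 16)  # stride 32 from the stages, then x2 up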
================================================
FILE: models/models/backbones/swin_utils.py
================================================
# --------------------------------------------------------
# SimMIM
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# Modified by Zhenda Xie
# --------------------------------------------------------
import numpy as np
import torch
from scipy import interpolate
def load_pretrained(config, model, logger):
checkpoint = torch.load(config, map_location='cpu')
checkpoint_model = checkpoint['model']
if any('encoder.' in k for k in checkpoint_model.keys()):
    checkpoint_model = {k.replace('encoder.', ''): v for k, v in checkpoint_model.items() if
                        k.startswith('encoder.')}
    print("Detected a pre-trained encoder checkpoint, removing the 'encoder.' prefix.")
else:
    print('Detected a regular checkpoint, loading it without modification.')
checkpoint_model = remap_pretrained_keys_swin(model, checkpoint_model, logger)
msg = model.load_state_dict(checkpoint_model, strict=False)
print(msg)
del checkpoint
torch.cuda.empty_cache()
def remap_pretrained_keys_swin(model, checkpoint_model, logger):
state_dict = model.state_dict()
# Geometric interpolation when the pre-trained window size does not match the fine-tuned window size
all_keys = list(checkpoint_model.keys())
for key in all_keys:
if "relative_position_bias_table" in key:
relative_position_bias_table_pretrained = checkpoint_model[key]
relative_position_bias_table_current = state_dict[key]
L1, nH1 = relative_position_bias_table_pretrained.size()
L2, nH2 = relative_position_bias_table_current.size()
if nH1 != nH2:
print(f"Error in loading {key}, passing......")
else:
if L1 != L2:
print(f"{key}: Interpolate relative_position_bias_table using geo.")
src_size = int(L1 ** 0.5)
dst_size = int(L2 ** 0.5)
def geometric_progression(a, r, n):
return a * (1.0 - r ** n) / (1.0 - r)
left, right = 1.01, 1.5
while right - left > 1e-6:
q = (left + right) / 2.0
gp = geometric_progression(1, q, src_size // 2)
if gp > dst_size // 2:
right = q
else:
left = q
# if q > 1.090307:
# q = 1.090307
dis = []
cur = 1
for i in range(src_size // 2):
dis.append(cur)
cur += q ** (i + 1)
r_ids = [-_ for _ in reversed(dis)]
x = r_ids + [0] + dis
y = r_ids + [0] + dis
t = dst_size // 2.0
dx = np.arange(-t, t + 0.1, 1.0)
dy = np.arange(-t, t + 0.1, 1.0)
print("Original positions = %s" % str(x))
print("Target positions = %s" % str(dx))
all_rel_pos_bias = []
for i in range(nH1):
z = relative_position_bias_table_pretrained[:, i].view(src_size, src_size).float().numpy()
f_cubic = interpolate.interp2d(x, y, z, kind='cubic')
all_rel_pos_bias.append(torch.Tensor(f_cubic(dx, dy)).contiguous().view(-1, 1).to(
relative_position_bias_table_pretrained.device))
new_rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
checkpoint_model[key] = new_rel_pos_bias
# delete relative_position_index since we always re-init it
relative_position_index_keys = [k for k in checkpoint_model.keys() if "relative_position_index" in k]
for k in relative_position_index_keys:
del checkpoint_model[k]
# delete relative_coords_table since we always re-init it
relative_coords_table_keys = [k for k in checkpoint_model.keys() if "relative_coords_table" in k]
for k in relative_coords_table_keys:
del checkpoint_model[k]
# re-map keys due to name change
rpe_mlp_keys = [k for k in checkpoint_model.keys() if "rpe_mlp" in k]
for k in rpe_mlp_keys:
checkpoint_model[k.replace('rpe_mlp', 'cpb_mlp')] = checkpoint_model.pop(k)
# delete attn_mask since we always re-init it
attn_mask_keys = [k for k in checkpoint_model.keys() if "attn_mask" in k]
for k in attn_mask_keys:
del checkpoint_model[k]
return checkpoint_model
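# Editor's sketch (not part of the original file): load_pretrained() expects a
# checkpoint path whose weights live under the 'model' key (SimMIM-style). Any
# 'encoder.' prefixes are stripped, and relative-position tables are re-fitted
# to the current window size by the geometric interpolation above. The filename
# below is hypothetical:
#
#     model = SwinTransformerV2(...)  # built elsewhere
#     load_pretrained('simmim_swin_pretrain.pth', model, logger=None)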
================================================
FILE: models/models/detectors/__init__.py
================================================
from .pam import PoseAnythingModel
__all__ = ['PoseAnythingModel']
================================================
FILE: models/models/detectors/pam.py
================================================
import numpy as np
import torch
from mmpose.models import builder
from mmpose.models.builder import POSENETS
from mmpose.models.detectors.base import BasePose
from models.models.backbones.swin_utils import load_pretrained
@POSENETS.register_module()
class PoseAnythingModel(BasePose):
"""Few-shot keypoint detectors.
Args:
keypoint_head (dict): Keypoint head to process feature.
encoder_config (dict): Config for encoder. Default: None.
pretrained (str): Name or path of the pretrained backbone; the string selects the backbone type ('swin', a 'dino'/'dinov2' torch.hub model name, or 'resnet').
train_cfg (dict): Config for training. Default: None.
test_cfg (dict): Config for testing. Default: None.
"""
def __init__(self,
keypoint_head,
encoder_config,
pretrained=False,
train_cfg=None,
test_cfg=None):
super().__init__()
self.backbone, self.backbone_type = self.init_backbone(pretrained, encoder_config)
self.keypoint_head = builder.build_head(keypoint_head)
self.keypoint_head.init_weights()
self.train_cfg = train_cfg
self.test_cfg = test_cfg if test_cfg is not None else {}
self.target_type = self.test_cfg.get('target_type', 'GaussianHeatMap')
def init_backbone(self, pretrained, encoder_config):
if 'swin' in pretrained:
encoder_sample = builder.build_backbone(encoder_config)
if '.pth' in pretrained:
load_pretrained(pretrained, encoder_sample, logger=None)
backbone = 'swin'
elif 'dino' in pretrained:
if 'dinov2' in pretrained:
repo = 'facebookresearch/dinov2'
backbone = 'dinov2'
else:
repo = 'facebookresearch/dino:main'
backbone = 'dino'
encoder_sample = torch.hub.load(repo, pretrained)
elif 'resnet' in pretrained:
pretrained = 'torchvision://resnet50'
encoder_config = dict(type='ResNet', depth=50, out_indices=(3,))
encoder_sample = builder.build_backbone(encoder_config)
encoder_sample.init_weights(pretrained)
backbone = 'resnet50'
else:
raise NotImplementedError(f'backbone {pretrained} not supported')
return encoder_sample, backbone
@property
def with_keypoint(self):
"""Check if has keypoint_head."""
return hasattr(self, 'keypoint_head')
def init_weights(self, pretrained=None):
"""Weight initialization for model."""
# NOTE: the model only owns a backbone and a keypoint head; there is
# no separate query-encoder submodule to initialize.
self.backbone.init_weights(pretrained)
self.keypoint_head.init_weights()
def forward(self,
img_s,
img_q,
target_s=None,
target_weight_s=None,
target_q=None,
target_weight_q=None,
img_metas=None,
return_loss=True,
**kwargs):
"""Defines the computation performed at every call."""
if return_loss:
return self.forward_train(img_s, target_s, target_weight_s, img_q,
target_q, target_weight_q, img_metas,
**kwargs)
else:
return self.forward_test(img_s, target_s, target_weight_s, img_q,
target_q, target_weight_q, img_metas,
**kwargs)
def forward_dummy(self, img_s, target_s, target_weight_s, img_q, target_q,
target_weight_q, img_metas, **kwargs):
return self.predict(
img_s, target_s, target_weight_s, img_q, img_metas)
def forward_train(self,
img_s,
target_s,
target_weight_s,
img_q,
target_q,
target_weight_q,
img_metas,
**kwargs):
"""Defines the computation performed at every call when training."""
bs, _, h, w = img_q.shape
output, initial_proposals, similarity_map, mask_s = self.predict(
img_s, target_s, target_weight_s, img_q, img_metas)
# parse the img meta to get the target keypoints
target_keypoints = self.parse_keypoints_from_img_meta(img_metas, output.device, keyword='query')
target_sizes = torch.tensor([img_q.shape[-2], img_q.shape[-1]]).unsqueeze(0).repeat(img_q.shape[0], 1, 1)
# if return loss
losses = dict()
if self.with_keypoint:
keypoint_losses = self.keypoint_head.get_loss(
output, initial_proposals, similarity_map, target_keypoints,
target_q, target_weight_q * mask_s, target_sizes)
losses.update(keypoint_losses)
keypoint_accuracy = self.keypoint_head.get_accuracy(output[-1],
target_keypoints,
target_weight_q * mask_s,
target_sizes,
height=h)
losses.update(keypoint_accuracy)
return losses
def forward_test(self,
img_s,
target_s,
target_weight_s,
img_q,
target_q,
target_weight_q,
img_metas=None,
**kwargs):
"""Defines the computation performed at every call when testing."""
batch_size, _, img_height, img_width = img_q.shape
output, initial_proposals, similarity_map, _ = self.predict(img_s, target_s, target_weight_s, img_q, img_metas)
predicted_pose = output[-1].detach().cpu().numpy() # [bs, num_query, 2]
result = {}
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(img_metas, predicted_pose, img_size=[img_width, img_height])
result.update(keypoint_result)
result.update({
"points":
torch.cat((initial_proposals, output.squeeze(1))).cpu().numpy()
})
result.update({"sample_image_file": img_metas[0]['sample_image_file']})
return result
def predict(self,
img_s,
target_s,
target_weight_s,
img_q,
img_metas=None):
batch_size, _, img_height, img_width = img_q.shape
# NOTE: a non-empty list is always truthy, so this assert is vacuous as
# written; it documents the intent rather than enforcing it.
assert [i['sample_skeleton'][0] != i['query_skeleton'] for i in img_metas]
skeleton = [i['sample_skeleton'][0] for i in img_metas]
feature_q, feature_s = self.extract_features(img_s, img_q)
mask_s = target_weight_s[0]
for target_weight in target_weight_s:
mask_s = mask_s * target_weight
output, initial_proposals, similarity_map = self.keypoint_head(feature_q, feature_s, target_s, mask_s, skeleton)
return output, initial_proposals, similarity_map, mask_s
def extract_features(self, img_s, img_q):
if self.backbone_type == 'swin':
feature_q = self.backbone.forward_features(img_q) # [bs, C, h, w]
feature_s = [self.backbone.forward_features(img) for img in img_s]
elif self.backbone_type == 'dino':
batch_size, _, img_height, img_width = img_q.shape
feature_q = self.backbone.get_intermediate_layers(img_q, n=1)[0][:, 1:] \
.reshape(batch_size, img_height // 8, img_width // 8, -1).permute(0, 3, 1, 2)  # [bs, c, h//8, w//8]
feature_s = [self.backbone.get_intermediate_layers(img, n=1)[0][:, 1:].
reshape(batch_size, img_height // 8, img_width // 8, -1).permute(0, 3, 1, 2) for img in img_s]
elif self.backbone_type == 'dinov2':
batch_size, _, img_height, img_width = img_q.shape
feature_q = self.backbone.get_intermediate_layers(img_q, n=1, reshape=True)[0] # [bs, c, h, w]
feature_s = [self.backbone.get_intermediate_layers(img, n=1, reshape=True)[0] for img in img_s]
else:
feature_s = [self.backbone(img) for img in img_s]
feature_q = self.backbone(img_q)
return feature_q, feature_s
def parse_keypoints_from_img_meta(self, img_meta, device, keyword='query'):
"""Parse keypoints from the img_meta.
Args:
img_meta (dict): Image meta info.
device (torch.device): Device of the output keypoints.
keyword (str): 'query' or 'sample'. Default: 'query'.
Returns:
Tensor: Keypoints coordinates of query images.
"""
if keyword == 'query':
query_kpt = torch.stack([
torch.tensor(info[f'{keyword}_joints_3d']).to(device)
for info in img_meta
], dim=0)[:, :, :2] # [bs, num_query, 2]
else:
query_kpt = []
for info in img_meta:
if isinstance(info[f'{keyword}_joints_3d'][0], torch.Tensor):
samples = torch.stack(info[f'{keyword}_joints_3d'])
else:
samples = np.array(info[f'{keyword}_joints_3d'])
query_kpt.append(torch.tensor(samples).to(device)[:, :, :2])
query_kpt = torch.stack(query_kpt, dim=0)  # [bs, num_samples, num_query, 2]
return query_kpt
# UNMODIFIED
def show_result(self,
img,
result,
skeleton=None,
kpt_score_thr=0.3,
bbox_color='green',
pose_kpt_color=None,
pose_limb_color=None,
radius=4,
text_color=(255, 0, 0),
thickness=1,
font_scale=0.5,
win_name='',
show=False,
wait_time=0,
out_file=None):
"""Draw `result` over `img`.
Args:
img (str or Tensor): The image to be displayed.
result (list[dict]): The results to draw over `img`
(bbox_result, pose_result).
kpt_score_thr (float, optional): Minimum score of keypoints
to be shown. Default: 0.3.
bbox_color (str or tuple or :obj:`Color`): Color of bbox lines.
pose_kpt_color (np.array[Nx3]): Color of N keypoints.
If None, do not draw keypoints.
pose_limb_color (np.array[Mx3]): Color of M limbs.
If None, do not draw limbs.
text_color (str or tuple or :obj:`Color`): Color of texts.
thickness (int): Thickness of lines.
font_scale (float): Font scales of texts.
win_name (str): The window name.
wait_time (int): Value of waitKey param.
Default: 0.
out_file (str or None): The filename to write the image.
Default: None.
Returns:
Tensor: Visualized img, only if not `show` or `out_file`.
"""
img = mmcv.imread(img)
img = img.copy()
img_h, img_w, _ = img.shape
bbox_result = []
pose_result = []
for res in result:
bbox_result.append(res['bbox'])
pose_result.append(res['keypoints'])
if len(bbox_result) > 0:
bboxes = np.vstack(bbox_result)
# draw bounding boxes
mmcv.imshow_bboxes(
img,
bboxes,
colors=bbox_color,
top_k=-1,
thickness=thickness,
show=False,
win_name=win_name,
wait_time=wait_time,
out_file=None)
for person_id, kpts in enumerate(pose_result):
# draw each point on image
if pose_kpt_color is not None:
assert len(pose_kpt_color) == len(kpts), (
len(pose_kpt_color), len(kpts))
for kid, kpt in enumerate(kpts):
x_coord, y_coord, kpt_score = int(kpt[0]), int(
kpt[1]), kpt[2]
if kpt_score > kpt_score_thr:
img_copy = img.copy()
r, g, b = pose_kpt_color[kid]
cv2.circle(img_copy, (int(x_coord), int(y_coord)),
radius, (int(r), int(g), int(b)), -1)
transparency = max(0, min(1, kpt_score))
cv2.addWeighted(
img_copy,
transparency,
img,
1 - transparency,
0,
dst=img)
# draw limbs
if skeleton is not None and pose_limb_color is not None:
assert len(pose_limb_color) == len(skeleton)
for sk_id, sk in enumerate(skeleton):
pos1 = (int(kpts[sk[0] - 1, 0]), int(kpts[sk[0] - 1,
1]))
pos2 = (int(kpts[sk[1] - 1, 0]), int(kpts[sk[1] - 1,
1]))
if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0
and pos1[1] < img_h and pos2[0] > 0
and pos2[0] < img_w and pos2[1] > 0
and pos2[1] < img_h
and kpts[sk[0] - 1, 2] > kpt_score_thr
and kpts[sk[1] - 1, 2] > kpt_score_thr):
img_copy = img.copy()
X = (pos1[0], pos2[0])
Y = (pos1[1], pos2[1])
mX = np.mean(X)
mY = np.mean(Y)
length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
angle = math.degrees(
math.atan2(Y[0] - Y[1], X[0] - X[1]))
stickwidth = 2
polygon = cv2.ellipse2Poly(
(int(mX), int(mY)),
(int(length / 2), int(stickwidth)), int(angle),
0, 360, 1)
r, g, b = pose_limb_color[sk_id]
cv2.fillConvexPoly(img_copy, polygon,
(int(r), int(g), int(b)))
transparency = max(
0,
min(
1, 0.5 *
(kpts[sk[0] - 1, 2] + kpts[sk[1] - 1, 2])))
cv2.addWeighted(
img_copy,
transparency,
img,
1 - transparency,
0,
dst=img)
if show:
height, width = img.shape[:2]
max_ = max(height, width)
factor = min(1, 800 / max_)
enlarge = cv2.resize(
img, (0, 0),
fx=factor,
fy=factor,
interpolation=cv2.INTER_CUBIC)
imshow(enlarge, win_name, wait_time)
if out_file is not None:
imwrite(img, out_file)
return img
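# Minimal sketch with assumed toy shapes (not part of the model API):
# `predict` above intersects the per-shot visibility masks by elementwise
# multiplication, so a keypoint is kept only if it is visible in every
# support image.
def _demo_mask_intersection():
    target_weight_s = [
        torch.tensor([[[1.], [1.], [0.]]]),  # shot 1: keypoint 3 invisible
        torch.tensor([[[1.], [0.], [1.]]]),  # shot 2: keypoint 2 invisible
    ]
    mask_s = target_weight_s[0]
    for target_weight in target_weight_s:
        mask_s = mask_s * target_weight
    return mask_s  # tensor([[[1.], [0.], [0.]]]): only keypoint 1 survives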
================================================
FILE: models/models/keypoint_heads/__init__.py
================================================
from .head import PoseHead
__all__ = ['PoseHead']
================================================
FILE: models/models/keypoint_heads/head.py
================================================
from copy import deepcopy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (Conv2d, Linear, xavier_init)
from mmcv.cnn.bricks.transformer import build_positional_encoding
from mmpose.core.evaluation import keypoint_pck_accuracy
from mmpose.core.post_processing import transform_preds
from mmpose.models import HEADS
from mmpose.models.utils.ops import resize
from models.models.utils import build_transformer
def inverse_sigmoid(x, eps=1e-3):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
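# Quick numeric illustration (assumed values, not used elsewhere):
# inverse_sigmoid is a clamped logit, so sigmoid(inverse_sigmoid(p))
# recovers p inside (eps, 1 - eps), while 0 and 1 map to finite logits
# instead of +/- infinity.
def _demo_inverse_sigmoid():
    p = torch.tensor([0.0, 0.25, 0.5, 0.75, 1.0])
    return torch.sigmoid(inverse_sigmoid(p))  # endpoints clamped near 0 / 1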
class TokenDecodeMLP(nn.Module):
'''
The MLP used to predict coordinates from the support keypoint tokens.
'''
def __init__(self,
in_channels,
hidden_channels,
out_channels=2,
num_layers=3):
super(TokenDecodeMLP, self).__init__()
layers = []
for i in range(num_layers):
if i == 0:
layers.append(nn.Linear(in_channels, hidden_channels))
layers.append(nn.GELU())
else:
layers.append(nn.Linear(hidden_channels, hidden_channels))
layers.append(nn.GELU())
layers.append(nn.Linear(hidden_channels, out_channels))
self.mlp = nn.Sequential(*layers)
def forward(self, x):
return self.mlp(x)
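# Shape sketch with assumed dimensions: the MLP maps per-keypoint support
# tokens of width `in_channels` to 2-D coordinate offsets, leaving the
# leading batch/keypoint dims untouched.
def _demo_token_decode_mlp(bs=2, num_kpts=17, dim=256):
    mlp = TokenDecodeMLP(in_channels=dim, hidden_channels=dim)
    tokens = torch.randn(bs, num_kpts, dim)
    return mlp(tokens).shape  # torch.Size([2, 17, 2])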
@HEADS.register_module()
class PoseHead(nn.Module):
'''
In the two-stage regression scheme (A3), the proposal generator is moved
into the transformer. All valid proposals are given a positional embedding
to better regress their locations.
'''
def __init__(self,
in_channels,
transformer=None,
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=128,
normalize=True),
encoder_positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=512,
normalize=True),
share_kpt_branch=False,
num_decoder_layer=3,
with_heatmap_loss=False,
with_bb_loss=False,
bb_temperature=0.2,
heatmap_loss_weight=2.0,
support_order_dropout=-1,
extra=None,
train_cfg=None,
test_cfg=None):
super().__init__()
self.in_channels = in_channels
self.positional_encoding = build_positional_encoding(positional_encoding)
self.encoder_positional_encoding = build_positional_encoding(encoder_positional_encoding)
self.transformer = build_transformer(transformer)
self.embed_dims = self.transformer.d_model
self.with_heatmap_loss = with_heatmap_loss
self.with_bb_loss = with_bb_loss
self.bb_temperature = bb_temperature
self.heatmap_loss_weight = heatmap_loss_weight
self.support_order_dropout = support_order_dropout
assert 'num_feats' in positional_encoding
num_feats = positional_encoding['num_feats']
assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
f' and {num_feats}.'
if extra is not None and not isinstance(extra, dict):
raise TypeError('extra should be dict or None.')
"""Initialize layers of the transformer head."""
self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1)
self.query_proj = Linear(self.in_channels, self.embed_dims)
# Instantiate the proposal generator and subsequent keypoint branch.
kpt_branch = TokenDecodeMLP(
in_channels=self.embed_dims, hidden_channels=self.embed_dims)
if share_kpt_branch:
self.kpt_branch = nn.ModuleList(
[kpt_branch for i in range(num_decoder_layer)])
else:
self.kpt_branch = nn.ModuleList(
[deepcopy(kpt_branch) for i in range(num_decoder_layer)])
self.train_cfg = {} if train_cfg is None else train_cfg
self.test_cfg = {} if test_cfg is None else test_cfg
self.target_type = self.test_cfg.get('target_type', 'GaussianHeatMap')
def init_weights(self):
for m in self.modules():
if hasattr(m, 'weight') and m.weight.dim() > 1:
xavier_init(m, distribution='uniform')
"""Initialize weights of the transformer head."""
# The initialization for transformer is important
self.transformer.init_weights()
# initialization for input_proj & prediction head
for mlp in self.kpt_branch:
nn.init.constant_(mlp.mlp[-1].weight.data, 0)
nn.init.constant_(mlp.mlp[-1].bias.data, 0)
nn.init.xavier_uniform_(self.input_proj.weight, gain=1)
nn.init.constant_(self.input_proj.bias, 0)
nn.init.xavier_uniform_(self.query_proj.weight, gain=1)
nn.init.constant_(self.query_proj.bias, 0)
def forward(self, x, feature_s, target_s, mask_s, skeleton):
""""Forward function for a single feature level.
Args:
x (Tensor): Input feature from backbone's single stage, shape
[bs, c, h, w].
Returns:
all_cls_scores (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
cls_out_channels should includes background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression
head with normalized coordinate format (cx, cy, w, h).
Shape [nb_dec, bs, num_query, 4].
"""
# construct binary masks which are used for the transformer.
# NOTE: following the official DETR repo, non-zero values represent
# ignored positions, while zero values mean valid positions.
# process query image feature
x = self.input_proj(x)
bs, dim, h, w = x.shape
# Disable the support keypoint positional embedding
support_order_embedding = x.new_zeros((bs, self.embed_dims, 1, target_s[0].shape[1])).to(torch.bool)
# Feature map pos embedding
masks = x.new_zeros((x.shape[0], x.shape[2], x.shape[3])).to(torch.bool)
pos_embed = self.positional_encoding(masks)
# process keypoint token feature
query_embed_list = []
for i, (feature, target) in enumerate(zip(feature_s, target_s)):
# resize the support feature back to the heatmap sizes.
resized_feature = resize(
input=feature,
size=target.shape[-2:],
mode='bilinear',
align_corners=False)
target = target / (target.sum(dim=-1).sum(dim=-1)[:, :, None, None] + 1e-8)
support_keypoints = target.flatten(2) @ resized_feature.flatten(2).permute(0, 2, 1)
query_embed_list.append(support_keypoints)
support_keypoints = torch.mean(torch.stack(query_embed_list, dim=0), 0)
support_keypoints = support_keypoints * mask_s
support_keypoints = self.query_proj(support_keypoints)
masks_query = (~mask_s.to(torch.bool)).squeeze(-1) # True indicating this query matched no actual joints.
# outs_dec: [nb_dec, bs, num_query, c]
# memory: [bs, c, h, w]
# x = Query image feature, support_keypoints = Support keypoint feature
outs_dec, initial_proposals, out_points, similarity_map = self.transformer(x,
masks,
support_keypoints,
pos_embed,
support_order_embedding,
masks_query,
self.positional_encoding,
self.kpt_branch,
skeleton)
output_kpts = []
for idx in range(outs_dec.shape[0]):
layer_delta_unsig = self.kpt_branch[idx](outs_dec[idx])
layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(
out_points[idx])
output_kpts.append(layer_outputs_unsig.sigmoid())
return torch.stack(output_kpts, dim=0), initial_proposals, similarity_map
def get_loss(self, output, initial_proposals, similarity_map, target, target_heatmap, target_weight, target_sizes):
# Calculate top-down keypoint loss.
losses = dict()
# denormalize the predicted coordinates.
num_dec_layer, bs, nq = output.shape[:3]
target_sizes = target_sizes.to(output.device) # [bs, 1, 2]
target = target / target_sizes
target = target[None, :, :, :].repeat(num_dec_layer, 1, 1, 1)
# set the weight for unset query point to be zero
normalizer = target_weight.squeeze(dim=-1).sum(dim=-1) # [bs, ]
normalizer[normalizer == 0] = 1
# compute the heatmap loss
if self.with_heatmap_loss:
losses['heatmap_loss'] = self.heatmap_loss(
similarity_map, target_heatmap, target_weight,
normalizer) * self.heatmap_loss_weight
# compute l1 loss for initial_proposals
proposal_l1_loss = F.l1_loss(
initial_proposals, target[0], reduction="none")
proposal_l1_loss = proposal_l1_loss.sum(
dim=-1, keepdim=False) * target_weight.squeeze(dim=-1)
proposal_l1_loss = proposal_l1_loss.sum(
dim=-1, keepdim=False) / normalizer # [bs, ]
losses['proposal_loss'] = proposal_l1_loss.sum() / bs
# compute l1 loss for each layer
for idx in range(num_dec_layer):
layer_output, layer_target = output[idx], target[idx]
l1_loss = F.l1_loss(
layer_output, layer_target, reduction="none") # [bs, query, 2]
l1_loss = l1_loss.sum(
dim=-1, keepdim=False) * target_weight.squeeze(
dim=-1) # [bs, query]
# normalize the loss for each sample with the number of visible joints
l1_loss = l1_loss.sum(dim=-1, keepdim=False) / normalizer # [bs, ]
losses['l1_loss' + '_layer' + str(idx)] = l1_loss.sum() / bs
return losses
def get_max_coords(self, heatmap, heatmap_size=64):
B, C, H, W = heatmap.shape
heatmap = heatmap.view(B, C, -1)
max_cor = heatmap.argmax(dim=2)
row, col = torch.floor(max_cor / heatmap_size), max_cor % heatmap_size
support_joints = torch.cat((row.unsqueeze(-1), col.unsqueeze(-1)), dim=-1)
return support_joints
def heatmap_loss(self, similarity_map, target_heatmap, target_weight,
normalizer):
# similarity_map: [bs, num_query, h, w]
# target_heatmap: [bs, num_query, sh, sw]
# target_weight: [bs, num_query, 1]
# preprocess the similarity_map
h, w = similarity_map.shape[-2:]
# similarity_map = torch.clamp(similarity_map, 0.0, None)
similarity_map = similarity_map.sigmoid()
target_heatmap = F.interpolate(
target_heatmap, size=(h, w), mode='bilinear')
target_heatmap = (target_heatmap /
(target_heatmap.max(dim=-1)[0].max(dim=-1)[0] + 1e-10)[:, :, None,
None])  # normalize each query heatmap to a peak value of 1
l2_loss = F.mse_loss(
similarity_map, target_heatmap, reduction="none") # bs, nq, h, w
l2_loss = l2_loss * target_weight[:, :, :, None] # bs, nq, h, w
l2_loss = l2_loss.flatten(2, 3).sum(-1) / (h * w) # bs, nq
l2_loss = l2_loss.sum(-1) / normalizer # bs,
return l2_loss.mean()
def get_accuracy(self, output, target, target_weight, target_sizes, height=256):
"""Calculate accuracy for top-down keypoint loss.
Args:
output (torch.Tensor[NxKx2]): estimated keypoints in ABSOLUTE coordinates.
target (torch.Tensor[NxKx2]): gt keypoints in ABSOLUTE coordinates.
target_weight (torch.Tensor[NxKx1]): Weights across different joint types.
target_sizes (torch.Tensor[Nx2]): shapes of the image.
"""
# NOTE: In POMNet, PCK is estimated on 1/8 resolution, which is slightly different here.
accuracy = dict()
output = output * float(height)
output, target, target_weight, target_sizes = (
output.detach().cpu().numpy(), target.detach().cpu().numpy(),
target_weight.squeeze(-1).long().detach().cpu().numpy(),
target_sizes.squeeze(1).detach().cpu().numpy())
_, avg_acc, _ = keypoint_pck_accuracy(
output,
target,
target_weight.astype(np.bool_),  # np.bool8 was removed in NumPy 2.0
thr=0.2,
normalize=target_sizes)
accuracy['acc_pose'] = float(avg_acc)
return accuracy
def decode(self, img_metas, output, img_size, **kwargs):
"""Decode the predicted keypoints from prediction.
Args:
img_metas (list(dict)): Information about data augmentation
By default this includes:
- "image_file: path to the image file
- "center": center of the bbox
- "scale": scale of the bbox
- "rotation": rotation of the bbox
- "bbox_score": score of bbox
output (np.ndarray[N, K, H, W]): model predicted heatmaps.
"""
batch_size = len(img_metas)
W, H = img_size
output = output * np.array([W, H])[None, None, :] # [bs, query, 2], coordinates with recovered shapes.
if 'bbox_id' in img_metas[0] or 'query_bbox_id' in img_metas[0]:
bbox_ids = []
else:
bbox_ids = None
c = np.zeros((batch_size, 2), dtype=np.float32)
s = np.zeros((batch_size, 2), dtype=np.float32)
image_paths = []
score = np.ones(batch_size)
for i in range(batch_size):
c[i, :] = img_metas[i]['query_center']
s[i, :] = img_metas[i]['query_scale']
image_paths.append(img_metas[i]['query_image_file'])
if 'query_bbox_score' in img_metas[i]:
score[i] = np.array(
img_metas[i]['query_bbox_score']).reshape(-1)
if 'bbox_id' in img_metas[i]:
bbox_ids.append(img_metas[i]['bbox_id'])
elif 'query_bbox_id' in img_metas[i]:
bbox_ids.append(img_metas[i]['query_bbox_id'])
preds = np.zeros(output.shape)
for idx in range(output.shape[0]):
preds[idx] = transform_preds(
output[idx],
c[idx],
s[idx], [W, H],
use_udp=self.test_cfg.get('use_udp', False))
all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32)
all_boxes = np.zeros((batch_size, 6), dtype=np.float32)
all_preds[:, :, 0:2] = preds[:, :, 0:2]
all_preds[:, :, 2:3] = 1.0
all_boxes[:, 0:2] = c[:, 0:2]
all_boxes[:, 2:4] = s[:, 0:2]
all_boxes[:, 4] = np.prod(s * 200.0, axis=1)
all_boxes[:, 5] = score
result = {}
result['preds'] = all_preds
result['boxes'] = all_boxes
result['image_paths'] = image_paths
result['bbox_ids'] = bbox_ids
return result
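# Toy-number sketch (assumed values): `get_loss` above normalizes each
# sample's summed keypoint error by its number of visible joints, and
# substitutes 1 when a sample has no visible joints to avoid dividing by
# zero.
def _demo_visible_joint_normalizer():
    target_weight = torch.tensor([[[1.], [1.], [0.]],
                                  [[0.], [0.], [0.]]])  # [bs=2, nq=3, 1]
    normalizer = target_weight.squeeze(dim=-1).sum(dim=-1)  # tensor([2., 0.])
    normalizer[normalizer == 0] = 1
    return normalizer  # tensor([2., 1.])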
================================================
FILE: models/models/utils/__init__.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from .builder import build_linear_layer, build_transformer, build_backbone
from .encoder_decoder import EncoderDecoder
from .positional_encoding import (LearnedPositionalEncoding,
SinePositionalEncoding)
from .transformer import (DetrTransformerDecoderLayer, DetrTransformerDecoder,
DetrTransformerEncoder, DynamicConv)
__all__ = [
'build_transformer', 'build_backbone', 'build_linear_layer', 'DetrTransformerDecoderLayer',
'DetrTransformerDecoder', 'DetrTransformerEncoder',
'LearnedPositionalEncoding', 'SinePositionalEncoding',
'EncoderDecoder',
]
================================================
FILE: models/models/utils/builder.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
from mmcv.utils import Registry, build_from_cfg
TRANSFORMER = Registry('Transformer')
BACKBONES = Registry('BACKBONES')
LINEAR_LAYERS = Registry('linear layers')
def build_backbone(cfg, default_args=None):
"""Build backbone."""
return build_from_cfg(cfg, BACKBONES, default_args)
def build_transformer(cfg, default_args=None):
"""Builder for Transformer."""
return build_from_cfg(cfg, TRANSFORMER, default_args)
LINEAR_LAYERS.register_module('Linear', module=nn.Linear)
def build_linear_layer(cfg, *args, **kwargs):
"""Build linear layer.
Args:
cfg (None or dict): The linear layer config, which should contain:
- type (str): Layer type.
- layer args: Args needed to instantiate an linear layer.
args (argument list): Arguments passed to the `__init__`
method of the corresponding linear layer.
kwargs (keyword arguments): Keyword arguments passed to the `__init__`
method of the corresponding linear layer.
Returns:
nn.Module: Created linear layer.
"""
if cfg is None:
cfg_ = dict(type='Linear')
else:
if not isinstance(cfg, dict):
raise TypeError('cfg must be a dict')
if 'type' not in cfg:
raise KeyError('the cfg dict must contain the key "type"')
cfg_ = cfg.copy()
layer_type = cfg_.pop('type')
if layer_type not in LINEAR_LAYERS:
raise KeyError(f'Unrecognized linear type {layer_type}')
else:
linear_layer = LINEAR_LAYERS.get(layer_type)
layer = linear_layer(*args, **kwargs, **cfg_)
return layer
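# Usage sketch: with cfg None, build_linear_layer falls back to a plain
# nn.Linear; otherwise it dispatches on cfg['type'] through the
# LINEAR_LAYERS registry, forwarding *args/**kwargs to the constructor.
def _demo_build_linear_layer():
    layer = build_linear_layer(dict(type='Linear'), 16, 8)
    return layer  # Linear(in_features=16, out_features=8, bias=True)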
================================================
FILE: models/models/utils/encoder_decoder.py
================================================
import copy
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init
from torch import Tensor
from models.models.utils.builder import TRANSFORMER
def inverse_sigmoid(x, eps=1e-3):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1 / x2)
class MLP(nn.Module):
""" Very simple multi-layer perceptron (also called FFN)"""
def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
super().__init__()
self.num_layers = num_layers
h = [hidden_dim] * (num_layers - 1)
self.layers = nn.ModuleList(
nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
def forward(self, x):
for i, layer in enumerate(self.layers):
x = F.gelu(layer(x)) if i < self.num_layers - 1 else layer(x)
return x
class ProposalGenerator(nn.Module):
def __init__(self, hidden_dim, proj_dim, dynamic_proj_dim):
super().__init__()
self.support_proj = nn.Linear(hidden_dim, proj_dim)
self.query_proj = nn.Linear(hidden_dim, proj_dim)
self.dynamic_proj = nn.Sequential(
nn.Linear(hidden_dim, dynamic_proj_dim),
nn.ReLU(),
nn.Linear(dynamic_proj_dim, hidden_dim))
self.dynamic_act = nn.Tanh()
def forward(self, query_feat, support_feat, spatial_shape):
"""
Args:
support_feat: [query, bs, c]
query_feat: [hw, bs, c]
spatial_shape: h, w
"""
device = query_feat.device
_, bs, c = query_feat.shape
h, w = spatial_shape
side_normalizer = torch.tensor([w, h]).to(query_feat.device)[None, None,
:]  # [1, 1, 2], used to normalize coords to [0, 1]
query_feat = query_feat.transpose(0, 1)
support_feat = support_feat.transpose(0, 1)
nq = support_feat.shape[1]
fs_proj = self.support_proj(support_feat) # [bs, query, c]
fq_proj = self.query_proj(query_feat) # [bs, hw, c]
pattern_attention = self.dynamic_act(self.dynamic_proj(fs_proj)) # [bs, query, c]
fs_feat = (pattern_attention + 1) * fs_proj # [bs, query, c]
similarity = torch.bmm(fq_proj, fs_feat.transpose(1, 2)) # [bs, hw, query]
similarity = similarity.transpose(1, 2).reshape(bs, nq, h, w)
grid_y, grid_x = torch.meshgrid(
torch.linspace(0.5, h - 0.5, h, dtype=torch.float32, device=device), # (h, w)
torch.linspace(0.5, w - 0.5, w, dtype=torch.float32, device=device))
# compute softmax and sum up
coord_grid = torch.stack([grid_x, grid_y],
dim=0).unsqueeze(0).unsqueeze(0).repeat(bs, nq, 1, 1, 1) # [bs, query, 2, h, w]
coord_grid = coord_grid.permute(0, 1, 3, 4, 2) # [bs, query, h, w, 2]
similarity_softmax = similarity.flatten(2, 3).softmax(dim=-1) # [bs, query, hw]
similarity_coord_grid = similarity_softmax[:, :, :, None] * coord_grid.flatten(2, 3)
proposal_for_loss = similarity_coord_grid.sum(dim=2, keepdim=False) # [bs, query, 2]
proposal_for_loss = proposal_for_loss / side_normalizer
max_pos = torch.argmax(similarity.reshape(bs, nq, -1), dim=-1, keepdim=True) # (bs, nq, 1)
max_mask = F.one_hot(max_pos, num_classes=w * h) # (bs, nq, 1, w*h)
max_mask = max_mask.reshape(bs, nq, w, h).type(torch.float)  # (bs, nq, w, h); NOTE: assumes a square feature map (h == w)
local_max_mask = F.max_pool2d(
input=max_mask, kernel_size=3, stride=1,
padding=1).reshape(bs, nq, w * h, 1) # (bs, nq, w*h, 1)
'''
proposal = (similarity_coord_grid * local_max_mask).sum(
dim=2, keepdim=False) / torch.count_nonzero(
local_max_mask, dim=2)
'''
# first, extract the local probability map with the mask
local_similarity_softmax = similarity_softmax[:, :, :, None] * local_max_mask # (bs, nq, w*h, 1)
# then, re-normalize the local probability map
local_similarity_softmax = local_similarity_softmax / (
local_similarity_softmax.sum(dim=-2, keepdim=True) + 1e-10
) # [bs, nq, w*h, 1]
# point-wise multiplication of the local probability map and the coord grid
proposals = local_similarity_softmax * coord_grid.flatten(2, 3) # [bs, nq, w*h, 2]
# sum the multiplication to obtain the final coord proposals
proposals = proposals.sum(dim=2) / side_normalizer # [bs, nq, 2]
return proposal_for_loss, similarity, proposals
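# Soft-argmax sketch with toy tensors (illustrative only): ProposalGenerator
# turns each support keypoint's similarity map into coordinates by
# softmax-weighting a pixel-center grid, the same expectation trick used in
# `forward` above for `proposal_for_loss`.
def _demo_soft_argmax(h=4, w=4):
    sim = torch.zeros(1, 1, h, w)
    sim[0, 0, 1, 2] = 10.0  # sharp peak at (x=2, y=1)
    grid_y, grid_x = torch.meshgrid(
        torch.linspace(0.5, h - 0.5, h),
        torch.linspace(0.5, w - 0.5, w))
    coord = torch.stack([grid_x, grid_y], dim=-1).flatten(0, 1)  # [hw, 2]
    weights = sim.flatten(2).softmax(dim=-1)  # [1, 1, hw]
    return weights @ coord / torch.tensor([w, h])  # approx [[[0.625, 0.375]]]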
@TRANSFORMER.register_module()
class EncoderDecoder(nn.Module):
def __init__(self,
d_model=256,
nhead=8,
num_encoder_layers=3,
num_decoder_layers=3,
graph_decoder=None,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
normalize_before=False,
similarity_proj_dim=256,
dynamic_proj_dim=128,
return_intermediate_dec=True,
look_twice=False,
detach_support_feat=False):
super().__init__()
self.d_model = d_model
self.nhead = nhead
encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
activation, normalize_before)
encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
decoder_layer = GraphTransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
activation, normalize_before, graph_decoder)
decoder_norm = nn.LayerNorm(d_model)
self.decoder = GraphTransformerDecoder(d_model, decoder_layer, num_decoder_layers, decoder_norm,
return_intermediate=return_intermediate_dec,
look_twice=look_twice, detach_support_feat=detach_support_feat)
self.proposal_generator = ProposalGenerator(
hidden_dim=d_model,
proj_dim=similarity_proj_dim,
dynamic_proj_dim=dynamic_proj_dim)
def init_weights(self):
# follow the official DETR to init parameters
for m in self.modules():
if hasattr(m, 'weight') and m.weight.dim() > 1:
xavier_init(m, distribution='uniform')
def forward(self, src, mask, support_embed, pos_embed, support_order_embed,
query_padding_mask, position_embedding, kpt_branch, skeleton, return_attn_map=False):
bs, c, h, w = src.shape
src = src.flatten(2).permute(2, 0, 1)
pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
support_order_embed = support_order_embed.flatten(2).permute(2, 0, 1)
pos_embed = torch.cat((pos_embed, support_order_embed))
query_embed = support_embed.transpose(0, 1)
mask = mask.flatten(1)
query_embed, refined_support_embed = self.encoder(
src,
query_embed,
src_key_padding_mask=mask,
query_key_padding_mask=query_padding_mask,
pos=pos_embed)
# Generate initial proposals and corresponding positional embedding.
initial_proposals_for_loss, similarity_map, initial_proposals = self.proposal_generator(
query_embed, refined_support_embed, spatial_shape=[h, w])
initial_position_embedding = position_embedding.forward_coordinates(initial_proposals)
outs_dec, out_points, attn_maps = self.decoder(
refined_support_embed,
query_embed,
memory_key_padding_mask=mask,
pos=pos_embed,
query_pos=initial_position_embedding,
tgt_key_padding_mask=query_padding_mask,
position_embedding=position_embedding,
initial_proposals=initial_proposals,
kpt_branch=kpt_branch,
skeleton=skeleton,
return_attn_map=return_attn_map)
return outs_dec.transpose(1, 2), initial_proposals_for_loss, out_points, similarity_map
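# Layout sketch (toy tensor): `EncoderDecoder.forward` flattens [bs, c, h, w]
# feature maps into the sequence-first [hw, bs, c] layout expected by
# nn.MultiheadAttention; this is all `src.flatten(2).permute(2, 0, 1)` does.
def _demo_sequence_layout():
    src = torch.randn(2, 256, 8, 8)        # [bs, c, h, w]
    seq = src.flatten(2).permute(2, 0, 1)  # [hw, bs, c]
    return seq.shape  # torch.Size([64, 2, 256])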
class GraphTransformerDecoder(nn.Module):
def __init__(self,
d_model,
decoder_layer,
num_layers,
norm=None,
return_intermediate=False,
look_twice=False,
detach_support_feat=False):
super().__init__()
self.layers = _get_clones(decoder_layer, num_layers)
self.num_layers = num_layers
self.norm = norm
self.return_intermediate = return_intermediate
self.ref_point_head = MLP(d_model, d_model, d_model, num_layers=2)
self.look_twice = look_twice
self.detach_support_feat = detach_support_feat
def forward(self,
support_feat,
query_feat,
tgt_mask=None,
memory_mask=None,
tgt_key_padding_mask=None,
memory_key_padding_mask=None,
pos=None,
query_pos=None,
position_embedding=None,
initial_proposals=None,
kpt_branch=None,
skeleton=None,
return_attn_map=False):
"""
position_embedding: Class used to compute positional embedding
initial_proposals: [bs, nq, 2], normalized coordinates of the initial proposals
kpt_branch: MLP used to predict the offsets for each query.
"""
refined_support_feat = support_feat
intermediate = []
attn_maps = []
bi = initial_proposals.detach()
bi_tag = initial_proposals.detach()
query_points = [initial_proposals.detach()]
tgt_key_padding_mask_remove_all_true = tgt_key_padding_mask.clone().to(tgt_key_padding_mask.device)
tgt_key_padding_mask_remove_all_true[tgt_key_padding_mask.logical_not().sum(dim=-1) == 0, 0] = False
for layer_idx, layer in enumerate(self.layers):
if layer_idx == 0:  # use the positional embedding from the initial proposals
query_pos_embed = query_pos.transpose(0, 1)
else:
# recalculate the positional embedding
query_pos_embed = position_embedding.forward_coordinates(bi)
query_pos_embed = query_pos_embed.transpose(0, 1)
query_pos_embed = self.ref_point_head(query_pos_embed)
if self.detach_support_feat:
refined_support_feat = refined_support_feat.detach()
refined_support_feat, attn_map = layer(
refined_support_feat,
query_feat,
tgt_mask=tgt_mask,
memory_mask=memory_mask,
tgt_key_padding_mask=tgt_key_padding_mask_remove_all_true,
memory_key_padding_mask=memory_key_padding_mask,
pos=pos,
query_pos=query_pos_embed,
skeleton=skeleton)
if self.return_intermediate:
intermediate.append(self.norm(refined_support_feat))
if return_attn_map:
attn_maps.append(attn_map)
# update the query coordinates
delta_bi = kpt_branch[layer_idx](refined_support_feat.transpose(0, 1))
# Prediction loss
if self.look_twice:
bi_pred = self.update(bi_tag, delta_bi)
bi_tag = self.update(bi, delta_bi)
else:
bi_tag = self.update(bi, delta_bi)
bi_pred = bi_tag
bi = bi_tag.detach()
query_points.append(bi_pred)
if self.norm is not None:
refined_support_feat = self.norm(refined_support_feat)
if self.return_intermediate:
intermediate.pop()
intermediate.append(refined_support_feat)
if self.return_intermediate:
return torch.stack(intermediate), query_points, attn_maps
return refined_support_feat.unsqueeze(0), query_points, attn_maps
def update(self, query_coordinates, delta_unsig):
query_coordinates_unsigmoid = inverse_sigmoid(query_coordinates)
new_query_coordinates = query_coordinates_unsigmoid + delta_unsig
new_query_coordinates = new_query_coordinates.sigmoid()
return new_query_coordinates
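# Refinement sketch (made-up numbers): each decoder layer predicts an
# unsigmoided delta that `update` adds to the current coordinates in logit
# space, so refined points always stay inside the unit square after the
# final sigmoid.
def _demo_coordinate_update():
    coords = torch.tensor([[[0.5, 0.5]]])  # current normalized coordinates
    delta = torch.tensor([[[1.0, -1.0]]])  # predicted logit-space offset
    return (inverse_sigmoid(coords) + delta).sigmoid()  # [[[0.7311, 0.2689]]]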
class GraphTransformerDecoderLayer(nn.Module):
def __init__(self,
d_model,
nhead,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
normalize_before=False,
graph_decoder=None):
super().__init__()
self.graph_decoder = graph_decoder
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
self.multihead_attn = nn.MultiheadAttention(
d_model * 2, nhead, dropout=dropout, vdim=d_model)
self.choker = nn.Linear(in_features=2 * d_model, out_features=d_model)
# Implementation of Feedforward model
if self.graph_decoder is None:
self.ffn1 = nn.Linear(d_model, dim_feedforward)
self.ffn2 = nn.Linear(dim_feedforward, d_model)
elif self.graph_decoder == 'pre':
self.ffn1 = GCNLayer(d_model, dim_feedforward, batch_first=False)
self.ffn2 = nn.Linear(dim_feedforward, d_model)
elif self.graph_decoder == 'post':
self.ffn1 = nn.Linear(d_model, dim_feedforward)
self.ffn2 = GCNLayer(dim_feedforward, d_model, batch_first=False)
else:
self.ffn1 = GCNLayer(d_model, dim_feedforward, batch_first=False)
self.ffn2 = GCNLayer(dim_feedforward, d_model, batch_first=False)
self.dropout = nn.Dropout(dropout)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
self.activation = _get_activation_fn(activation)
self.normalize_before = normalize_before
def with_pos_embed(self, tensor, pos: Optional[Tensor]):
return tensor if pos is None else tensor + pos
def forward(self,
refined_support_feat,
refined_query_feat,
tgt_mask: Optional[Tensor] = None,
memory_mask: Optional[Tensor] = None,
tgt_key_padding_mask: Optional[Tensor] = None,
memory_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None,
query_pos: Optional[Tensor] = None,
skeleton: Optional[list] = None):
q = k = self.with_pos_embed(refined_support_feat, query_pos + pos[refined_query_feat.shape[0]:])
tgt2 = self.self_attn(
q,
k,
value=refined_support_feat,
attn_mask=tgt_mask,
key_padding_mask=tgt_key_padding_mask)[0]
refined_support_feat = refined_support_feat + self.dropout1(tgt2)
refined_support_feat = self.norm1(refined_support_feat)
# concatenate the positional embedding with the content feature, instead of direct addition
cross_attn_q = torch.cat((refined_support_feat, query_pos + pos[refined_query_feat.shape[0]:]), dim=-1)
cross_attn_k = torch.cat((refined_query_feat, pos[:refined_query_feat.shape[0]]), dim=-1)
tgt2, attn_map = self.multihead_attn(
query=cross_attn_q,
key=cross_attn_k,
value=refined_query_feat,
attn_mask=memory_mask,
key_padding_mask=memory_key_padding_mask)
refined_support_feat = refined_support_feat + self.dropout2(self.choker(tgt2))
refined_support_feat = self.norm2(refined_support_feat)
if self.graph_decoder is not None:
num_pts, b, c = refined_support_feat.shape
adj = adj_from_skeleton(num_pts=num_pts,
skeleton=skeleton,
mask=tgt_key_padding_mask,
device=refined_support_feat.device)
if self.graph_decoder == 'pre':
tgt2 = self.ffn2(self.dropout(self.activation(self.ffn1(refined_support_feat, adj))))
elif self.graph_decoder == 'post':
tgt2 = self.ffn2(self.dropout(self.activation(self.ffn1(refined_support_feat))), adj)
else:
tgt2 = self.ffn2(self.dropout(self.activation(self.ffn1(refined_support_feat, adj))), adj)
else:
tgt2 = self.ffn2(self.dropout(self.activation(self.ffn1(refined_support_feat))))
refined_support_feat = refined_support_feat + self.dropout3(tgt2)
refined_support_feat = self.norm3(refined_support_feat)
return refined_support_feat, attn_map
class TransformerEncoder(nn.Module):
def __init__(self, encoder_layer, num_layers, norm=None):
super().__init__()
self.layers = _get_clones(encoder_layer, num_layers)
self.num_layers = num_layers
self.norm = norm
def forward(self,
src,
query,
mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
query_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None):
# src: [hw, bs, c]
# query: [num_query, bs, c]
# mask: None by default
# src_key_padding_mask: [bs, hw]
# query_key_padding_mask: [bs, nq]
# pos: [hw, bs, c]
# organize the input
# implement the attention mask to mask out the useless points
n, bs, c = src.shape
src_cat = torch.cat((src, query), dim=0) # [hw + nq, bs, c]
mask_cat = torch.cat((src_key_padding_mask, query_key_padding_mask),
dim=1) # [bs, hw+nq]
output = src_cat
for layer in self.layers:
output = layer(
output,
query_length=n,
src_mask=mask,
src_key_padding_mask=mask_cat,
pos=pos)
if self.norm is not None:
output = self.norm(output)
# resplit the output into src and query
refined_query = output[n:, :, :] # [nq, bs, c]
output = output[:n, :, :] # [n, bs, c]
return output, refined_query
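# Concatenation sketch (assumed toy shapes): TransformerEncoder self-attends
# over image tokens and support keypoint tokens jointly by concatenating
# them along the sequence dim, then re-splits the output at n = hw exactly
# as in `forward` above.
def _demo_joint_encoding_split(hw=64, nq=17, bs=2, c=32):
    src = torch.randn(hw, bs, c)
    query = torch.randn(nq, bs, c)
    src_cat = torch.cat((src, query), dim=0)  # [hw + nq, bs, c]
    return src_cat[:hw].shape, src_cat[hw:].shape  # split back into src/query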
class TransformerEncoderLayer(nn.Module):
def __init__(self,
d_model,
nhead,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
normalize_before=False):
super().__init__()
self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.activation = _get_activation_fn(activation)
self.normalize_before = normalize_before
def with_pos_embed(self, tensor, pos: Optional[Tensor]):
return tensor if pos is None else tensor + pos
def forward(self,
src,
query_length,
src_mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
pos: Optional[Tensor] = None):
src = self.with_pos_embed(src, pos)
q = k = src
# NOTE: compared with original implementation, we add positional embedding into the VALUE.
src2 = self.self_attn(
q,
k,
value=src,
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask)[0]
src = src + self.dropout1(src2)
src = self.norm1(src)
src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
src = src + self.dropout2(src2)
src = self.norm2(src)
return src
def adj_from_skeleton(num_pts, skeleton, mask, device='cuda'):
adj_mx = torch.empty(0, device=device)
batch_size = len(skeleton)
for b in range(batch_size):
edges = torch.tensor(skeleton[b])
adj = torch.zeros(num_pts, num_pts, device=device)
adj[edges[:, 0], edges[:, 1]] = 1
adj_mx = torch.cat((adj_mx, adj.unsqueeze(0)), dim=0)
trans_adj_mx = torch.transpose(adj_mx, 1, 2)
cond = (trans_adj_mx > adj_mx).float()
adj = adj_mx + trans_adj_mx * cond - adj_mx * cond
adj = adj * ~mask[..., None] * ~mask[:, None]
adj = torch.nan_to_num(adj / adj.sum(dim=-1, keepdim=True))
adj = torch.stack((torch.diag_embed(~mask), adj), dim=1)
return adj
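# Adjacency sketch (toy skeleton, illustrative only): adj_from_skeleton
# symmetrizes the edge list, zeroes out padded keypoints, row-normalizes,
# and stacks a diagonal "self" kernel with the neighbor kernel for the
# 2-kernel GCNLayer below.
def _demo_adjacency():
    skeleton = [[[0, 1], [1, 2]]]               # a chain of 3 joints, bs=1
    mask = torch.zeros(1, 3, dtype=torch.bool)  # no padded joints
    adj = adj_from_skeleton(3, skeleton, mask, device='cpu')
    return adj.shape  # torch.Size([1, 2, 3, 3]): [bs, kernel, v, v]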
class GCNLayer(nn.Module):
def __init__(self,
in_features,
out_features,
kernel_size=2,
use_bias=True,
activation=nn.ReLU(inplace=True),
batch_first=True):
super(GCNLayer, self).__init__()
self.conv = nn.Conv1d(in_features, out_features * kernel_size, kernel_size=1,
padding=0, stride=1, dilation=1, bias=use_bias)
self.kernel_size = kernel_size
self.activation = activation
self.batch_first = batch_first
def forward(self, x, adj):
assert adj.size(1) == self.kernel_size
if not self.batch_first:
x = x.permute(1, 2, 0)
else:
x = x.transpose(1, 2)
x = self.conv(x)
b, kc, v = x.size()
x = x.view(b, self.kernel_size, kc // self.kernel_size, v)
x = torch.einsum('bkcv,bkvw->bcw', (x, adj))
if self.activation is not None:
x = self.activation(x)
if not self.batch_first:
x = x.permute(2, 0, 1)
else:
x = x.transpose(1, 2)
return x
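# GCN usage sketch (assumed sizes): with kernel_size=2, GCNLayer projects
# features to two kernels via a 1x1 conv and aggregates them against the
# stacked [self, neighbor] adjacency produced by adj_from_skeleton.
def _demo_gcn_layer():
    layer = GCNLayer(in_features=8, out_features=8, batch_first=True)
    x = torch.randn(1, 3, 8)  # [bs, v, c]
    adj = adj_from_skeleton(3, [[[0, 1], [1, 2]]],
                            torch.zeros(1, 3, dtype=torch.bool), device='cpu')
    return layer(x, adj).shape  # torch.Size([1, 3, 8])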
def _get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
def clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
================================================
FILE: models/models/utils/positional_encoding.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import math
import torch
import torch.nn as nn
from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING
from mmcv.runner import BaseModule
# TODO: add a SinePositionalEncoding for coordinate input
@POSITIONAL_ENCODING.register_module()
class SinePositionalEncoding(BaseModule):
"""Position encoding with sine and cosine functions.
See `End-to-End Object Detection with Transformers
<https://arxiv.org/abs/2005.12872>`_ for details.
Args:
num_feats (int): The feature dimension for each position
along x-axis or y-axis. Note the final returned dimension
for each position is 2 times of this value.
temperature (int, optional): The temperature used for scaling
the position embedding. Defaults to 10000.
normalize (bool, optional): Whether to normalize the position
embedding. Defaults to False.
scale (float, optional): A scale factor that scales the position
embedding. The scale will be used only when `normalize` is True.
Defaults to 2*pi.
eps (float, optional): A value added to the denominator for
numerical stability. Defaults to 1e-6.
offset (float): offset add to embed when do the normalization.
Defaults to 0.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None
"""
def __init__(self,
num_feats,
temperature=10000,
normalize=False,
scale=2 * math.pi,
eps=1e-6,
offset=0.,
init_cfg=None):
super(SinePositionalEncoding, self).__init__(init_cfg)
if normalize:
assert isinstance(scale, (float, int)), 'when normalize is set,' \
'scale should be provided and in float or int type, ' \
f'found {type(scale)}'
self.num_feats = num_feats
self.temperature = temperature
self.normalize = normalize
self.scale = scale
self.eps = eps
self.offset = offset
def forward(self, mask):
"""Forward function for `SinePositionalEncoding`.
Args:
mask (Tensor): ByteTensor mask. Non-zero values represent
ignored positions, while zero values mean valid positions
for this image. Shape [bs, h, w].
Returns:
pos (Tensor): Returned position embedding with shape
[bs, num_feats*2, h, w].
"""
# For convenience of exporting to ONNX, it's required to convert
# `masks` from bool to int.
mask = mask.to(torch.int)
not_mask = 1 - mask # logical_not
y_embed = not_mask.cumsum(1, dtype=torch.float32) # [bs, h, w], recording the y coordinate of each pixel
x_embed = not_mask.cumsum(2, dtype=torch.float32)
if self.normalize: # default True
y_embed = (y_embed + self.offset) / \
(y_embed[:, -1:, :] + self.eps) * self.scale
x_embed = (x_embed + self.offset) / \
(x_embed[:, :, -1:] + self.eps) * self.scale
dim_t = torch.arange(
self.num_feats, dtype=torch.float32, device=mask.device)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats)
pos_x = x_embed[:, :, :, None] / dim_t # [bs, h, w, num_feats]
pos_y = y_embed[:, :, :, None] / dim_t
# use `view` instead of `flatten` for dynamically exporting to ONNX
B, H, W = mask.size()
pos_x = torch.stack(
(pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
dim=4).view(B, H, W, -1) # [bs, h, w, num_feats]
pos_y = torch.stack(
(pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
dim=4).view(B, H, W, -1)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
return pos
def forward_coordinates(self, coord):
"""
Forward function for normalized coordinate input with shape [bs, kpt, 2].
return:
pos (Tensor): position embedding with the shape of [bs, kpt, num_feats*2]
"""
x_embed, y_embed = coord[:, :, 0], coord[:, :, 1] # [bs, kpt]
x_embed = x_embed * self.scale # [bs, kpt]
y_embed = y_embed * self.scale
dim_t = torch.arange(
self.num_feats, dtype=torch.float32, device=coord.device)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats)
pos_x = x_embed[:, :, None] / dim_t # [bs, kpt, num_feats]
pos_y = y_embed[:, :, None] / dim_t # [bs, kpt, num_feats]
bs, kpt, _ = pos_x.shape
pos_x = torch.stack(
(pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),
dim=3).view(bs, kpt, -1) # [bs, kpt, num_feats]
pos_y = torch.stack(
(pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),
dim=3).view(bs, kpt, -1) # [bs, kpt, num_feats]
pos = torch.cat((pos_y, pos_x), dim=2) # [bs, kpt, num_feats * 2]
return pos
def __repr__(self):
"""str: a string that describes the module"""
repr_str = self.__class__.__name__
repr_str += f'(num_feats={self.num_feats}, '
repr_str += f'temperature={self.temperature}, '
repr_str += f'normalize={self.normalize}, '
repr_str += f'scale={self.scale}, '
repr_str += f'eps={self.eps})'
return repr_str
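# Coordinate-embedding sketch (assumed sizes): forward_coordinates embeds
# normalized (x, y) keypoints directly, producing the same num_feats*2
# channel width as the spatial forward(), so proposal embeddings and
# feature-map embeddings live in one space.
def _demo_coordinate_embedding():
    enc = SinePositionalEncoding(num_feats=128, normalize=True)
    coords = torch.rand(2, 17, 2)  # [bs, kpt, 2], normalized to [0, 1]
    return enc.forward_coordinates(coords).shape  # torch.Size([2, 17, 256])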
@POSITIONAL_ENCODING.register_module()
class LearnedPositionalEncoding(BaseModule):
"""Position embedding with learnable embedding weights.
Args:
num_feats (int): The feature dimension for each position
along x-axis or y-axis. The final returned dimension for
each position is 2 times of this value.
row_num_embed (int, optional): The dictionary size of row embeddings.
Default 50.
col_num_embed (int, optional): The dictionary size of col embeddings.
Default 50.
init_cfg (dict or list[dict], optional): Initialization config dict.
"""
def __init__(self,
num_feats,
row_num_embed=50,
col_num_embed=50,
init_cfg=dict(type='Uniform', layer='Embedding')):
super(LearnedPositionalEncoding, self).__init__(init_cfg)
self.row_embed = nn.Embedding(row_num_embed, num_feats)
self.col_embed = nn.Embedding(col_num_embed, num_feats)
self.num_feats = num_feats
self.row_num_embed = row_num_embed
self.col_num_embed = col_num_embed
def forward(self, mask):
"""Forward function for `LearnedPositionalEncoding`.
Args:
mask (Tensor): ByteTensor mask. Non-zero values represent
ignored positions, while zero values mean valid positions
for this image. Shape [bs, h, w].
Returns:
pos (Tensor): Returned position embedding with shape
[bs, num_feats*2, h, w].
"""
h, w = mask.shape[-2:]
x = torch.arange(w, device=mask.device)
y = torch.arange(h, device=mask.device)
x_embed = self.col_embed(x)
y_embed = self.row_embed(y)
pos = torch.cat(
(x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(
1, w, 1)),
dim=-1).permute(2, 0,
1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1)
return pos
def __repr__(self):
"""str: a string that describes the module"""
repr_str = self.__class__.__name__
repr_str += f'(num_feats={self.num_feats}, '
repr_str += f'row_num_embed={self.row_num_embed}, '
repr_str += f'col_num_embed={self.col_num_embed})'
return repr_str
================================================
FILE: models/models/utils/transformer.py
================================================
import torch
import torch.nn as nn
from models.models.utils.builder import TRANSFORMER
from mmcv.cnn import (build_activation_layer, build_norm_layer, xavier_init)
from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
TransformerLayerSequence,
build_transformer_layer_sequence)
from mmcv.runner.base_module import BaseModule
@TRANSFORMER.register_module()
class Transformer(BaseModule):
"""Implements the DETR transformer.
Following the official DETR implementation, this module is copy-pasted
from torch.nn.Transformer with modifications:
* positional encodings are passed in MultiheadAttention
* extra LN at the end of encoder is removed
* decoder returns a stack of activations from all decoding layers
See `paper: End-to-End Object Detection with Transformers
<https://arxiv.org/abs/2005.12872>`_ for details.
Args:
encoder (`mmcv.ConfigDict` | Dict): Config of
TransformerEncoder. Defaults to None.
decoder ((`mmcv.ConfigDict` | Dict)): Config of
TransformerDecoder. Defaults to None
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Defaults to None.
"""
def __init__(self, encoder=None, decoder=None, init_cfg=None):
super(Transformer, self).__init__(init_cfg=init_cfg)
self.encoder = build_transformer_layer_sequence(encoder)
self.decoder = build_transformer_layer_sequence(decoder)
self.embed_dims = self.encoder.embed_dims
def init_weights(self):
# follow the official DETR to init parameters
for m in self.modules():
if hasattr(m, 'weight') and m.weight.dim() > 1:
xavier_init(m, distribution='uniform')
self._is_init = True
def forward(self, x, mask, query_embed, pos_embed, mask_query):
"""Forward function for `Transformer`.
Args:
x (Tensor): Input query with shape [bs, c, h, w] where
c = embed_dims.
mask (Tensor): The key_padding_mask used for encoder and decoder,
with shape [bs, h, w].
query_embed (Tensor): The query embedding for decoder, with shape
[num_query, c].
pos_embed (Tensor): The positional encoding for encoder and
decoder, with the same shape as `x`.
Returns:
tuple[Tensor]: results of decoder containing the following tensor.
- out_dec: Output from decoder. If return_intermediate_dec \
is True output has shape [num_dec_layers, bs,
num_query, embed_dims], else has shape [1, bs, \
num_query, embed_dims].
- memory: Output results from encoder, with shape \
[bs, embed_dims, h, w].
Notes:
x: query image features with shape [bs, c, h, w]
mask: mask for x with shape [bs, h, w]
pos_embed: positional embedding for x with shape [bs, c, h, w]
query_embed: sample keypoint features with shape [bs, num_query, c]
mask_query: mask for query_embed with shape [bs, num_query]
Outputs:
out_dec: [num_layers, bs, num_query, c]
memory: [bs, c, h, w]
"""
bs, c, h, w = x.shape
# use `view` instead of `flatten` for dynamically exporting to ONNX
x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c]
mask = mask.view(bs,
-1) # [bs, h, w] -> [bs, h*w] Note: this mask should be filled with False, since all images are of the same shape.
pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) # positional embeding for memory, i.e., the query.
memory = self.encoder(
query=x,
key=None,
value=None,
query_pos=pos_embed,
query_key_padding_mask=mask) # output memory: [hw, bs, c]
query_embed = query_embed.permute(1, 0, 2) # [bs, num_query, c] -> [num_query, bs, c]
# target = torch.zeros_like(query_embed)
# out_dec: [num_layers, num_query, bs, c]
out_dec = self.decoder(
query=query_embed,
key=memory,
value=memory,
key_pos=pos_embed,
# query_pos=query_embed,
query_key_padding_mask=mask_query,
key_padding_mask=mask)
out_dec = out_dec.transpose(1, 2) # [decoder_layer, bs, num_query, c]
memory = memory.permute(1, 2, 0).reshape(bs, c, h, w)
return out_dec, memory
@TRANSFORMER_LAYER.register_module()
class DetrTransformerDecoderLayer(BaseTransformerLayer):
"""Implements decoder layer in DETR transformer.
Args:
attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )):
Configs for self_attention or cross_attention, the order
should be consistent with it in `operation_order`. If it is
a dict, it would be expanded to the number of attentions in
`operation_order`.
feedforward_channels (int): The hidden dimension for FFNs.
ffn_dropout (float): Probability of an element to be zeroed
in ffn. Default 0.0.
operation_order (tuple[str]): The execution order of operation
in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
Default:None
act_cfg (dict): The activation config for FFNs. Default: ReLU.
norm_cfg (dict): Config dict for normalization layer.
Default: `LN`.
ffn_num_fcs (int): The number of fully-connected layers in FFNs.
Default:2.
"""
def __init__(self,
attn_cfgs,
feedforward_channels,
ffn_dropout=0.0,
operation_order=None,
act_cfg=dict(type='ReLU', inplace=True),
norm_cfg=dict(type='LN'),
ffn_num_fcs=2,
**kwargs):
super(DetrTransformerDecoderLayer, self).__init__(
attn_cfgs=attn_cfgs,
feedforward_channels=feedforward_channels,
ffn_dropout=ffn_dropout,
operation_order=operation_order,
act_cfg=act_cfg,
norm_cfg=norm_cfg,
ffn_num_fcs=ffn_num_fcs,
**kwargs)
# assert len(operation_order) == 6
# assert set(operation_order) == set(
# ['self_attn', 'norm', 'cross_attn', 'ffn'])
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class DetrTransformerEncoder(TransformerLayerSequence):
"""TransformerEncoder of DETR.
Args:
post_norm_cfg (dict): Config of last normalization layer. Default:
`LN`. Only used when `self.pre_norm` is `True`
"""
def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
super(DetrTransformerEncoder, self).__init__(*args, **kwargs)
if post_norm_cfg is not None:
self.post_norm = build_norm_layer(
post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
else:
# assert not self.pre_norm, f'Use prenorm in ' \
# f'{self.__class__.__name__},' \
# f'Please specify post_norm_cfg'
self.post_norm = None
def forward(self, *args, **kwargs):
"""Forward function for `TransformerCoder`.
Returns:
Tensor: forwarded results with shape [num_query, bs, embed_dims].
"""
x = super(DetrTransformerEncoder, self).forward(*args, **kwargs)
if self.post_norm is not None:
x = self.post_norm(x)
return x
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class DetrTransformerDecoder(TransformerLayerSequence):
"""Implements the decoder in DETR transformer.
Args:
return_intermediate (bool): Whether to return intermediate outputs.
post_norm_cfg (dict): Config of last normalization layer. Default:
`LN`.
"""
def __init__(self,
*args,
post_norm_cfg=dict(type='LN'),
return_intermediate=False,
**kwargs):
super(DetrTransformerDecoder, self).__init__(*args, **kwargs)
self.return_intermediate = return_intermediate
if post_norm_cfg is not None:
self.post_norm = build_norm_layer(post_norm_cfg,
self.embed_dims)[1]
else:
self.post_norm = None
def forward(self, query, *args, **kwargs):
"""Forward function for `TransformerDecoder`.
Args:
query (Tensor): Input query with shape
`(num_query, bs, embed_dims)`.
Returns:
Tensor: Results with shape [1, num_query, bs, embed_dims] when
return_intermediate is `False`, otherwise it has shape
[num_layers, num_query, bs, embed_dims].
"""
if not self.return_intermediate:
x = super().forward(query, *args, **kwargs)
if self.post_norm:
x = self.post_norm(x)[None]
return x
intermediate = []
for layer in self.layers:
query = layer(query, *args, **kwargs)
if self.return_intermediate:
if self.post_norm is not None:
intermediate.append(self.post_norm(query))
else:
intermediate.append(query)
return torch.stack(intermediate)
@TRANSFORMER.register_module()
class DynamicConv(BaseModule):
"""Implements Dynamic Convolution.
This module generate parameters for each sample and
use bmm to implement 1*1 convolution. Code is modified
from the `official github repo `_ .
Args:
in_channels (int): The input feature channel.
Defaults to 256.
feat_channels (int): The inner feature channel.
Defaults to 64.
        out_channels (int, optional): The output feature channel.
            When not specified, it will be set to `in_channels`
            by default.
input_feat_shape (int): The shape of input feature.
Defaults to 7.
        with_proj (bool): Whether to project the two-dimensional feature
            to a one-dimensional feature. Default: True.
act_cfg (dict): The activation config for DynamicConv.
norm_cfg (dict): Config dict for normalization layer. Default
layer normalization.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
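
    Example:
        A minimal usage sketch (illustrative, not from the original file;
        tensor shapes follow the conventions documented in ``forward``):

        >>> import torch
        >>> dyn = DynamicConv(in_channels=256, feat_channels=64,
        ...                   input_feat_shape=7)
        >>> param_feature = torch.rand(10, 256)
        >>> input_feature = torch.rand(10, 256, 7, 7)
        >>> dyn(param_feature, input_feature).shape
        torch.Size([10, 256])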
"""
def __init__(self,
in_channels=256,
feat_channels=64,
out_channels=None,
input_feat_shape=7,
with_proj=True,
act_cfg=dict(type='ReLU', inplace=True),
norm_cfg=dict(type='LN'),
init_cfg=None):
super(DynamicConv, self).__init__(init_cfg)
self.in_channels = in_channels
self.feat_channels = feat_channels
self.out_channels_raw = out_channels
self.input_feat_shape = input_feat_shape
self.with_proj = with_proj
self.act_cfg = act_cfg
self.norm_cfg = norm_cfg
self.out_channels = out_channels if out_channels else in_channels
self.num_params_in = self.in_channels * self.feat_channels
self.num_params_out = self.out_channels * self.feat_channels
self.dynamic_layer = nn.Linear(
self.in_channels, self.num_params_in + self.num_params_out)
self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1]
self.activation = build_activation_layer(act_cfg)
num_output = self.out_channels * input_feat_shape ** 2
if self.with_proj:
self.fc_layer = nn.Linear(num_output, self.out_channels)
self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]
def forward(self, param_feature, input_feature):
"""Forward function for `DynamicConv`.
Args:
param_feature (Tensor): The feature can be used
to generate the parameter, has shape
(num_all_proposals, in_channels).
input_feature (Tensor): Feature that
interact with parameters, has shape
(num_all_proposals, in_channels, H, W).
Returns:
Tensor: The output feature has shape
(num_all_proposals, out_channels).
"""
        # (num_all_proposals, in_channels, H, W) -> (num_all_proposals, H*W, in_channels)
        input_feature = input_feature.flatten(2).permute(0, 2, 1)
parameters = self.dynamic_layer(param_feature)
param_in = parameters[:, :self.num_params_in].view(
-1, self.in_channels, self.feat_channels)
param_out = parameters[:, -self.num_params_out:].view(
-1, self.feat_channels, self.out_channels)
# input_feature has shape (num_all_proposals, H*W, in_channels)
# param_in has shape (num_all_proposals, in_channels, feat_channels)
# feature has shape (num_all_proposals, H*W, feat_channels)
features = torch.bmm(input_feature, param_in)
features = self.norm_in(features)
features = self.activation(features)
        # param_out has shape (num_all_proposals, feat_channels, out_channels)
features = torch.bmm(features, param_out)
features = self.norm_out(features)
features = self.activation(features)
if self.with_proj:
features = features.flatten(1)
features = self.fc_layer(features)
features = self.fc_norm(features)
features = self.activation(features)
return features
================================================
FILE: models/version.py
================================================
# GENERATED VERSION FILE
# TIME: Tue Dec 19 17:01:21 2023
__version__ = '0.2.0+f65cb07'
short_version = '0.2.0'
version_info = (0, 2, 0)
================================================
FILE: requirements.txt
================================================
json_tricks
numpy
opencv-python
pillow==6.2.2
xtcocotools
scipy
================================================
FILE: setup.cfg
================================================
[bdist_wheel]
universal=1
[aliases]
test=pytest
[tool:pytest]
addopts=tests/
[yapf]
based_on_style = pep8
blank_line_before_nested_class_or_def = true
split_before_expression_after_opening_paren = true
[isort]
line_length = 79
multi_line_output = 0
known_standard_library = pkg_resources,setuptools
known_first_party = mmpose
known_third_party = cv2,json_tricks,mmcv,mmdet,munkres,numpy,xtcocotools,torch
no_lines_before = STDLIB,LOCALFOLDER
default_section = THIRDPARTY
================================================
FILE: setup.py
================================================
import os
import subprocess
import time
from setuptools import find_packages, setup
def readme():
with open('README.md', encoding='utf-8') as f:
content = f.read()
return content
version_file = 'models/version.py'
def get_git_hash():
def _minimal_ext_cmd(cmd):
# construct minimal environment
env = {}
for k in ['SYSTEMROOT', 'PATH', 'HOME']:
v = os.environ.get(k)
if v is not None:
env[k] = v
# LANGUAGE is used on win32
env['LANGUAGE'] = 'C'
env['LANG'] = 'C'
env['LC_ALL'] = 'C'
out = subprocess.Popen(
cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
return out
try:
out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
sha = out.strip().decode('ascii')
except OSError:
sha = 'unknown'
return sha
def get_hash():
if os.path.exists('.git'):
sha = get_git_hash()[:7]
elif os.path.exists(version_file):
try:
from models.version import __version__
sha = __version__.split('+')[-1]
except ImportError:
raise ImportError('Unable to get git version')
else:
sha = 'unknown'
return sha
def write_version_py():
content = """# GENERATED VERSION FILE
# TIME: {}
__version__ = '{}'
short_version = '{}'
version_info = ({})
"""
sha = get_hash()
with open('models/VERSION', 'r') as f:
SHORT_VERSION = f.read().strip()
VERSION_INFO = ', '.join(SHORT_VERSION.split('.'))
VERSION = SHORT_VERSION + '+' + sha
version_file_str = content.format(time.asctime(), VERSION, SHORT_VERSION,
VERSION_INFO)
with open(version_file, 'w') as f:
f.write(version_file_str)
def get_version():
with open(version_file, 'r') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__version__']
def get_requirements(filename='requirements.txt'):
here = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(here, filename), 'r') as f:
requires = [line.replace('\n', '') for line in f.readlines()]
return requires
if __name__ == '__main__':
write_version_py()
setup(
name='pose_anything',
version=get_version(),
        description='Pose Anything: category-agnostic pose estimation',
long_description=readme(),
packages=find_packages(exclude=('configs', 'tools', 'demo')),
package_data={'pose_anything.ops': ['*/*.so']},
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
license='Apache License 2.0',
setup_requires=['pytest-runner', 'cython', 'numpy'],
tests_require=['pytest', 'xdoctest'],
install_requires=get_requirements(),
zip_safe=False)
================================================
FILE: test.py
================================================
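"""Evaluation entry point for a trained pose model.

Example (illustrative paths; ckpt/model.pth is a placeholder):
    python test.py configs/1shots/base_split1_config.py ckpt/model.pth
"""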
import argparse
import os
import os.path as osp
import random
import uuid
import mmcv
import numpy as np
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint
from models import * # noqa
from models.datasets import build_dataset
from mmpose.apis import multi_gpu_test, single_gpu_test
from mmpose.core import wrap_fp16_model
from mmpose.datasets import build_dataloader
from mmpose.models import build_posenet
def parse_args():
parser = argparse.ArgumentParser(description='mmpose test model')
parser.add_argument('config', default=None, help='test config file path')
parser.add_argument('checkpoint', default=None, help='checkpoint file')
parser.add_argument('--out', help='output result file')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
        help='whether to fuse conv and bn; this slightly increases the inference speed')
parser.add_argument(
'--eval',
default=None,
nargs='+',
help='evaluation metric, which depends on the dataset,'
' e.g., "mAP" for MSCOCO')
parser.add_argument(
'--permute_keypoints',
action='store_true',
help='whether to randomly permute keypoints')
parser.add_argument(
'--gpu_collect',
action='store_true',
help='whether to use gpu to collect results')
parser.add_argument('--tmpdir', help='tmp dir for writing some results')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def merge_configs(cfg1, cfg2):
# Merge cfg2 into cfg1
# Overwrite cfg1 if repeated, ignore if value is None.
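    # e.g. (illustrative): merge_configs({'metric': 'mAP'}, {'metric': None})
    # keeps 'mAP', while a truthy value in cfg2 would overwrite it.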
cfg1 = {} if cfg1 is None else cfg1.copy()
cfg2 = {} if cfg2 is None else cfg2
for k, v in cfg2.items():
if v:
cfg1[k] = v
return cfg1
def main():
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
    uuid.UUID(int=0)  # NOTE: builds a fixed UUID and discards it; this does not seed anything
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# cfg.model.pretrained = None
cfg.data.test.test_mode = True
args.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
mmcv.mkdir_or_exist(osp.abspath(args.work_dir))
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# build the dataloader
dataset = build_dataset(cfg.data.test, dict(test_mode=True))
dataloader_setting = dict(
samples_per_gpu=1,
workers_per_gpu=cfg.data.get('workers_per_gpu', 12),
dist=distributed,
shuffle=False,
drop_last=False)
dataloader_setting = dict(dataloader_setting,
**cfg.data.get('test_dataloader', {}))
data_loader = build_dataloader(dataset, **dataloader_setting)
# build the model and load checkpoint
model = build_posenet(cfg.model)
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
if not distributed:
model = MMDataParallel(model, device_ids=[0])
outputs = single_gpu_test(model, data_loader)
else:
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False)
outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect)
rank, _ = get_dist_info()
eval_config = cfg.get('evaluation', {})
eval_config = merge_configs(eval_config, dict(metric=args.eval))
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
results = dataset.evaluate(outputs, **eval_config)
print('\n')
for k, v in sorted(results.items()):
print(f'{k}: {v}')
# save testing log
test_log = "./work_dirs/testing_log.txt"
with open(test_log, 'a') as f:
f.write("** config_file: " + args.config + "\t checkpoint: " + args.checkpoint + "\t \n")
for k, v in sorted(results.items()):
f.write(f'\t {k}: {v}'+'\n')
f.write("********************************************************************\n")
if __name__ == '__main__':
main()
================================================
FILE: tools/dist_test.sh
================================================
#!/usr/bin/env bash
# Copyright (c) OpenMMLab. All rights reserved.
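# Usage (inferred from the positional args below):
#   ./tools/dist_test.sh <CONFIG> <CHECKPOINT> <NUM_GPUS> [extra test.py args...]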
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29000}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/../test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
================================================
FILE: tools/dist_train.sh
================================================
#!/usr/bin/env bash
# Copyright (c) OpenMMLab. All rights reserved.
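# Usage (inferred from the positional args below):
#   ./tools/dist_train.sh <CONFIG> <NUM_GPUS> <OUTPUT_DIR> [extra train.py args...]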
CONFIG=$1
GPUS=$2
OUTPUT_DIR=$3
PORT=${PORT:-29000}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/../train.py --config $CONFIG --work-dir $OUTPUT_DIR --launcher pytorch ${@:4}
================================================
FILE: tools/fix_carfuxion.py
================================================
import json
import os
import shutil
import sys
import numpy as np
from xtcocotools.coco import COCO
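# Usage (inferred from the sys.argv reads in __main__ below):
#   python tools/fix_carfuxion.py <carfusion_dir> <mp100_annotations_dir>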
def search_match(bbox, num_keypoints, segmentation):
found = []
checked = 0
for json_file, coco in COCO_DICT.items():
cat_ids = coco.getCatIds()
for cat_id in cat_ids:
img_ids = coco.getImgIds(catIds=cat_id)
for img_id in img_ids:
annotations = coco.loadAnns(coco.getAnnIds(imgIds=img_id, catIds=cat_id))
for ann in annotations:
checked += 1
if (ann['num_keypoints'] == num_keypoints and ann['bbox'] == bbox and ann[
'segmentation'] == segmentation):
src_file = coco.loadImgs(img_id)[0]["file_name"]
split = "test" if "test" in json_file else "train"
found.append((src_file, ann, split))
# return src_file, ann, split
if len(found) == 0:
raise Exception("No match found out of {} images".format(checked))
elif len(found) > 1:
raise Exception("More than one match! ".format(found))
return found[0]
if __name__ == "__main__":
carfusion_dir_path = sys.argv[1]
mp100_dataset_path = sys.argv[2]
os.makedirs('output', exist_ok=True)
for cat in ['car', 'bus', 'suv']:
os.makedirs(os.path.join('output', cat), exist_ok=True)
COCO_DICT = {}
ann_files = os.path.join(carfusion_dir_path, 'annotations')
for json_file in os.listdir(ann_files):
COCO_DICT[json_file] = COCO(os.path.join(carfusion_dir_path, 'annotations', json_file))
count = 0
print_log = []
for json_file in os.listdir(mp100_dataset_path):
print("Processing {}".format(json_file))
cats = {}
coco = COCO(os.path.join(mp100_dataset_path, json_file))
cat_ids = coco.getCatIds()
for cat_id in cat_ids:
category_info = coco.loadCats(cat_id)
cat_name = category_info[0]['name']
if cat_name in ['car', 'bus', 'suv']:
cats[cat_name] = cat_id
for cat_name, cat_id in cats.items():
img_ids = coco.getImgIds(catIds=cat_id)
count += len(img_ids)
print_log.append(f'{json_file} : {cat_name}: {len(img_ids)}')
for img_id in img_ids:
img = coco.loadImgs(img_id)[0]
dst_file_name = img['file_name']
annotation = coco.loadAnns(coco.getAnnIds(imgIds=img_id, catIds=cat_id, iscrowd=None))
bbox = annotation[0]['bbox']
keypoints = annotation[0]['keypoints']
segmentation = annotation[0]['segmentation']
num_keypoints = annotation[0]['num_keypoints']
# Search for a match:
src_img, src_ann, split = search_match(bbox, num_keypoints, segmentation)
shutil.copyfile(
os.path.join(carfusion_dir_path, split, src_img),
os.path.join('output', dst_file_name))
================================================
FILE: tools/slurm_test.sh
================================================
#!/usr/bin/env bash
set -x
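# Usage (inferred from the variables below; GPUS etc. are overridable env vars):
#   GPUS=8 ./tools/slurm_test.sh <PARTITION> <JOB_NAME> <CONFIG> <CHECKPOINT> [extra args...]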
PARTITION=$1
JOB_NAME=$2
CONFIG=$3
CHECKPOINT=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
--job-name=${JOB_NAME} \
--gres=gpu:${GPUS_PER_NODE} \
--ntasks=${GPUS} \
--ntasks-per-node=${GPUS_PER_NODE} \
--cpus-per-task=${CPUS_PER_TASK} \
--kill-on-bad-exit=1 \
${SRUN_ARGS} \
python -u test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS}
================================================
FILE: tools/slurm_train.sh
================================================
#!/usr/bin/env bash
set -x
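# Usage (inferred from the variables below; GPUS etc. are overridable env vars):
#   GPUS=8 ./tools/slurm_train.sh <PARTITION> <JOB_NAME> <CONFIG> <WORK_DIR> [extra args...]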
PARTITION=$1
JOB_NAME=$2
CONFIG=$3
WORK_DIR=$4
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
PY_ARGS=${@:5}
SRUN_ARGS=${SRUN_ARGS:-""}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
--job-name=${JOB_NAME} \
--gres=gpu:${GPUS_PER_NODE} \
--ntasks=${GPUS} \
--ntasks-per-node=${GPUS_PER_NODE} \
--cpus-per-task=${CPUS_PER_TASK} \
--kill-on-bad-exit=1 \
${SRUN_ARGS} \
    python -u train.py --config ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS}
================================================
FILE: tools/visualization.py
================================================
import os
import random
import matplotlib.pyplot as plt
import numpy as np
colors = [
[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
[85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
[0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
[255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]]
def plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, skeleton,
initial_proposals, prediction, radius=6, out_dir='./heatmaps'):
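    """Plot and save support/query keypoint visualizations.

    Summary added for clarity (inferred from the code below): both images
    are min-max normalized, keypoints with a positive weight are drawn as
    circles, limbs from `skeleton` connect keypoints whose endpoints are
    both visible, and the figures are saved to
    `<out_dir>/<idx>_support.png` and `<out_dir>/<idx>_query.png`.
    """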
img_names = [img.split("_")[0] for img in os.listdir(out_dir) if str_is_int(img.split("_")[0])]
if len(img_names) > 0:
name_idx = max([int(img_name) for img_name in img_names]) + 1
else:
name_idx = 0
h, w, c = support_img.shape
prediction = prediction[-1].cpu().numpy() * h
support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img))
query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img))
    for idx, (img, weights, keypoint) in enumerate(zip([support_img, query_img],
                                                       [support_w, query_w],
                                                       [support_kp, prediction])):
f, axes = plt.subplots()
plt.imshow(img)
for k in range(keypoint.shape[0]):
            if weights[k] > 0:
                kp = keypoint[k, :2]
                c = (1, 0, 0, 0.75) if weights[k] == 1 else (0, 0, 1, 0.6)
patch = plt.Circle(kp, radius, color=c)
axes.add_patch(patch)
axes.text(kp[0], kp[1], k)
plt.draw()
for l, limb in enumerate(skeleton):
kp = keypoint[:, :2]
if l > len(colors) - 1:
c = [x / 255 for x in random.sample(range(0, 255), 3)]
else:
c = [x / 255 for x in colors[l]]
            if weights[limb[0]] > 0 and weights[limb[1]] > 0:
patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]],
[kp[limb[0], 1], kp[limb[1], 1]],
linewidth=6, color=c, alpha=0.6)
axes.add_artist(patch)
plt.axis('off') # command for hiding the axis.
        name = 'support' if idx == 0 else 'query'
plt.savefig(f'./{out_dir}/{str(name_idx)}_{str(name)}.png', bbox_inches='tight', pad_inches=0)
        if idx == 1:
plt.show()
plt.clf()
plt.close('all')
def str_is_int(s):
try:
int(s)
return True
except ValueError:
return False
================================================
FILE: train.py
================================================
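"""Training entry point.

Example (illustrative paths; work_dirs/example is a placeholder):
    python train.py --config configs/1shots/base_split1_config.py --work-dir work_dirs/example
"""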
import argparse
import copy
import os
import os.path as osp
import time
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist, set_random_seed
from mmcv.utils import get_git_hash
from models import * # noqa
from models.apis import train_model
from models.datasets import build_dataset
from mmpose import __version__
from mmpose.models import build_posenet
from mmpose.utils import collect_env, get_root_logger
def parse_args():
parser = argparse.ArgumentParser(description='Train a pose model')
parser.add_argument('--config', default=None, help='train config file path')
parser.add_argument('--work-dir', default=None, help='the dir to save logs and models')
parser.add_argument(
'--resume-from', help='the checkpoint file to resume from')
    # NOTE: argparse's type=bool treats any non-empty string as True
    parser.add_argument(
        '--auto-resume', type=bool, default=True,
        help='automatically detect the latest checkpoint in the work dir '
             'and resume from it')
parser.add_argument(
'--no-validate',
action='store_true',
help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
help='number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--autoscale-lr',
action='store_true',
help='automatically scale lr with the number of gpus')
parser.add_argument(
'--show',
action='store_true',
help='whether to display the prediction results in a window.')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI
# > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
    # auto resume: cfg.work_dir is always set above, while args.work_dir
    # may be None, so resolve the checkpoint against cfg.work_dir
    if args.auto_resume:
        checkpoint = os.path.join(cfg.work_dir, 'latest.pth')
        if os.path.exists(checkpoint):
            cfg.resume_from = checkpoint
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' +
dash_line)
meta['env_info'] = env_info
# log some basic info
logger.info(f'Distributed training: {distributed}')
logger.info(f'Config:\n{cfg.pretty_text}')
    # set random seeds
    # NOTE: the seed is forced to 1 and deterministic mode is enabled here,
    # overriding the --seed and --deterministic CLI flags
    args.seed = 1
    args.deterministic = True
if args.seed is not None:
logger.info(f'Set random seed to {args.seed}, '
f'deterministic: {args.deterministic}')
set_random_seed(args.seed, deterministic=args.deterministic)
cfg.seed = args.seed
meta['seed'] = args.seed
model = build_posenet(cfg.model)
train_datasets = [build_dataset(cfg.data.train)]
# if len(cfg.workflow) == 2:
# val_dataset = copy.deepcopy(cfg.data.val)
# val_dataset.pipeline = cfg.data.train.pipeline
# datasets.append(build_dataset(val_dataset))
val_dataset = copy.deepcopy(cfg.data.val)
val_dataset = build_dataset(val_dataset, dict(test_mode=True))
if cfg.checkpoint_config is not None:
# save mmpose version, config file content
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmpose_version=__version__ + get_git_hash(digits=7),
config=cfg.pretty_text,
)
train_model(
model,
train_datasets,
val_dataset,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
if __name__ == '__main__':
main()