Repository: Guaishou74851/AdcSR Branch: main Commit: d0b2871e3de9 Files: 54 Total size: 518.0 KB Directory structure: gitextract_c9hslnpw/ ├── LICENSE ├── README.md ├── bsr/ │ ├── degradations.py │ ├── transforms.py │ └── utils/ │ ├── __init__.py │ ├── color_util.py │ ├── diffjpeg.py │ ├── dist_util.py │ ├── download_util.py │ ├── file_client.py │ ├── flow_util.py │ ├── img_process_util.py │ ├── img_util.py │ ├── lmdb_util.py │ ├── logger.py │ ├── matlab_functions.py │ ├── misc.py │ ├── options.py │ ├── plot_util.py │ └── registry.py ├── config.yml ├── dataset.py ├── evaluate.py ├── evaluate_debug.sh ├── forward.py ├── model.py ├── ram/ │ ├── configs/ │ │ ├── condition_config.json │ │ ├── med_config.json │ │ ├── q2l_config.json │ │ └── swin/ │ │ ├── config_swinB_384.json │ │ ├── config_swinL_384.json │ │ └── config_swinL_444.json │ ├── data/ │ │ ├── ram_tag_list.txt │ │ ├── ram_tag_list_chinese.txt │ │ ├── ram_tag_list_threshold.txt │ │ └── tag_list.txt │ └── models/ │ ├── __init__.py │ ├── bert.py │ ├── bert_lora.py │ ├── ram.py │ ├── ram_lora.py │ ├── swin_transformer.py │ ├── swin_transformer_lora.py │ ├── tag2text.py │ ├── tag2text_lora.py │ ├── utils.py │ └── vit.py ├── requirements.txt ├── test.py ├── test_debug.sh ├── train.py ├── train.sh ├── train_debug.sh └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================

icon

# (CVPR 2025) Adversarial Diffusion Compression for Real-World Image Super-Resolution [PyTorch] [![icon](https://img.shields.io/badge/ArXiv-Paper-.svg)](https://arxiv.org/abs/2411.13383) [![Hugging Face](https://img.shields.io/badge/Code_&_Models-%F0%9F%A4%97%20Hugging%20Face-blue)](https://huggingface.co/Guaishou74851/AdcSR) ![visitors](https://visitor-badge.laobi.icu/badge?page_id=Guaishou74851.AdcSR) [Bin Chen](https://scholar.google.com/citations?user=aZDNm98AAAAJ)1,3,\* | [Gehui Li](https://github.com/cvsym)1,\* | [Rongyuan Wu](https://scholar.google.com/citations?user=A-U8zE8AAAAJ)2,3,\* | [Xindong Zhang](https://scholar.google.com/citations?user=q76RnqIAAAAJ)3 | [Jie Chen](https://aimia-pku.github.io/)1,† | [Jian Zhang](https://jianzhang.tech/)1,† | [Lei Zhang](https://www4.comp.polyu.edu.hk/~cslzhang/)2,3 1 *School of Electronic and Computer Engineering, Peking University* 2 *The Hong Kong Polytechnic University*, 3 *OPPO Research Institute* \* Equal Contribution. † Corresponding Authors. ⭐ **If AdcSR is helpful to you, please star this repo. Thanks!** 🤗 ## 📝 Overview ### Highlights - **Adversarial Diffusion Compression (ADC).** We remove and prune redundant modules from the one-step diffusion network [OSEDiff](https://github.com/cswry/OSEDiff) and apply adversarial distillation to retain generative capabilities despite reduced capacity. - **Real-Time [Stable Diffusion](https://huggingface.co/stabilityai/stable-diffusion-2-1)-Based Image Super-Resolution.** AdcSR super-resolves a 128×128 image to 512×512 **in just 0.03s 🚀** on an A100 GPU. - **Competitive Visual Quality.** Despite **74% fewer parameters 📉** than [OSEDiff](https://github.com/cswry/OSEDiff), AdcSR achieves **competitive image quality** across multiple benchmarks. ### Framework 1. **Structural Compression** - **Removable modules** (VAE encoder, text prompt extractor, cross-attention, time embeddings) are eliminated. 
- **Prunable modules** (UNet, VAE decoder) are **channel-pruned** to optimize efficiency while preserving performance.

teaser

2. **Two-Stage Training** 1. **Pretraining a Pruned VAE Decoder** to maintain its ability to decode latent representations. 2. **Adversarial Distillation** to align compressed network features with the teacher model (e.g., [OSEDiff](https://github.com/cswry/OSEDiff)) and ground truth images.

method

## 😍 Visual Results [](https://imgsli.com/MzU2MjU1) [](https://imgsli.com/MzU2MjU2) [](https://imgsli.com/MzU2MjU3) [](https://imgsli.com/MzU2NTg4) [](https://imgsli.com/MzU2NTkw) [](https://imgsli.com/MzU2NTk1) [](https://imgsli.com/MzU2OTE0) [](https://imgsli.com/MzU2OTE1) https://github.com/user-attachments/assets/1211cefa-8704-47f5-82cd-ec4ef084b9ec comp ## ⚙ Installation ```shell git clone https://github.com/Guaishou74851/AdcSR.git cd AdcSR conda create -n AdcSR python=3.10 conda activate AdcSR pip install --upgrade pip pip install -r requirements.txt chmod +x train.sh train_debug.sh test_debug.sh evaluate_debug.sh ``` ## ⚡ Test 1. **Download test datasets** (`DIV2K-Val.zip`, `DRealSR.zip`, `RealSR.zip`) from [Hugging Face](https://huggingface.co/Guaishou74851/AdcSR) or [PKU Disk](https://disk.pku.edu.cn/link/AAD499197CBF054392BC4061F904CC4026). 2. **Unzip** them into `./testset/`, ensuring the structure: ``` ./testset/DIV2K-Val/LR/xxx.png ./testset/DIV2K-Val/HR/xxx.png ./testset/DRealSR/LR/xxx.png ./testset/DRealSR/HR/xxx.png ./testset/RealSR/LR/xxx.png ./testset/RealSR/HR/xxx.png ``` 3. **Download model weights** (`net_params_200.pkl`) from the same link and place it in `./weight/`. 4. **Run the test script** (or modify and execute `./test_debug.sh` for convenience): ```bash python test.py --LR_dir=path_to_LR_images --SR_dir=path_to_SR_images ``` The results will be saved in `path_to_SR_images`. 5. **Test Your Own Images**: - Place your **low-resolution (LR)** images into `./testset/xxx/`. - Run the command with `--LR_dir=./testset/xxx/ --SR_dir=./yyy/`, and the model will perform **x4 super-resolution**. ## 🍭 Evaluation Run the evaluation script (or modify and execute `./evaluate_debug.sh` for convenience): ```bash python evaluate.py --HR_dir=path_to_HR_images --SR_dir=path_to_SR_images ``` ## 🔥 Train This repo provides code for **Stage 2** training (**adversarial distillation**). 
For **Stage 1** (pretraining the channel-pruned VAE decoder), refer to our paper and use the code from the [Latent Diffusion Models](https://github.com/CompVis/latent-diffusion) repo. 1. **Download pretrained model weights** (`DAPE.pth`, `halfDecoder.ckpt`, `osediff.pkl`, `ram_swin_large_14m.pth`) from [Hugging Face](https://huggingface.co/Guaishou74851/AdcSR) or [PKU Disk](https://disk.pku.edu.cn/link/AAD499197CBF054392BC4061F904CC4026), and place them in `./weight/pretrained/`. 2. **Download the [LSDIR](https://huggingface.co/ofsoundof/LSDIR) dataset** and store it in your preferred location. 3. **Modify the dataset path** in `config.yml`: ```yaml dataroot_gt: path_to_HR_images_of_LSDIR ``` 4. **Run the training script** (or modify and execute `./train.sh` or `./train_debug.sh`): ```bash CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.run --nproc_per_node=8 --master_port=23333 train.py ``` The trained model will be saved in `./weight/`. ## 🥰 Acknowledgement This project is built upon the code of [Latent Diffusion Models](https://github.com/CompVis/latent-diffusion), [Diffusers](https://github.com/huggingface/diffusers), [BasicSR](https://github.com/XPixelGroup/BasicSR), and [OSEDiff](https://github.com/cswry/OSEDiff). We sincerely thank the authors of these repos for their significant contributions. 
def sigma_matrix2(sig_x, sig_y, theta):
    """Build the 2x2 sigma matrix of a rotated, axis-scaled Gaussian.

    The result is ``R . D . R^T`` with ``D = diag(sig_x**2, sig_y**2)`` and
    ``R = [[cos(theta), -sin(theta)], [sin(theta), cos(theta)]]``.

    Args:
        sig_x (float): Scale along the first (unrotated) axis.
        sig_y (float): Scale along the second (unrotated) axis.
        theta (float): Rotation angle in radians.

    Returns:
        ndarray: Rotated sigma matrix with the shape (2, 2).
    """
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])
    variances = np.array([[sig_x**2, 0], [0, sig_y**2]])
    # Same association as R . (D . R^T): scale first, then rotate.
    scaled = np.dot(variances, rotation.T)
    return np.dot(rotation, scaled)
def pdf2(sigma_matrix, grid):
    """Evaluate an un-normalized bivariate Gaussian on a coordinate grid.

    Args:
        sigma_matrix (ndarray): with the shape (2, 2).
        grid (ndarray): coordinates with the shape (K, K, 2), e.g. the first
            value returned by :func:`mesh_grid`.

    Returns:
        kernel (ndarray): ``exp(-0.5 * x^T . inv(sigma_matrix) . x)`` for each
            grid coordinate ``x``, with the shape (K, K). Not normalized.
    """
    precision = np.linalg.inv(sigma_matrix)
    # Quadratic form per grid cell, reduced over the trailing coordinate axis.
    quad_form = np.sum(np.dot(grid, precision) * grid, axis=2)
    return np.exp(-0.5 * quad_form)
def bivariate_plateau(kernel_size, sig_x, sig_y, theta, beta, grid=None, isotropic=True):
    """Generate a normalized plateau-shaped blur kernel.

    Every entry is ``1 / (1 + q**beta)``, where ``q`` is the quadratic form
    ``x^T . inv(sigma_matrix) . x`` of the grid coordinate ``x``. The result
    is divided by its sum.

    Reference: https://stats.stackexchange.com/questions/203629/is-there-a-plateau-shaped-distribution

    In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are
    ignored.

    Args:
        kernel_size (int): Size passed to :func:`mesh_grid` when `grid` is
            None; unused otherwise.
        sig_x (float): Scale along the first axis.
        sig_y (float): Scale along the second axis (anisotropic mode only).
        theta (float): Radian measurement (anisotropic mode only).
        beta (float): Exponent applied to the quadratic form.
        grid (ndarray, optional): coordinates with the shape (K, K, 2),
            K is the kernel size. Default: None
        isotropic (bool): Whether to use `sig_x` for both axes. Default: True

    Returns:
        kernel (ndarray): normalized kernel.
    """
    if grid is None:
        grid = mesh_grid(kernel_size)[0]
    covariance = (np.array([[sig_x**2, 0], [0, sig_x**2]])
                  if isotropic else sigma_matrix2(sig_x, sig_y, theta))
    precision = np.linalg.inv(covariance)
    quad_form = np.sum(np.dot(grid, precision) * grid, 2)
    profile = np.reciprocal(np.power(quad_form, beta) + 1)
    return profile / np.sum(profile)
def random_bivariate_generalized_Gaussian(kernel_size,
                                          sigma_x_range,
                                          sigma_y_range,
                                          rotation_range,
                                          beta_range,
                                          noise_range=None,
                                          isotropic=True):
    """Sample a bivariate generalized Gaussian kernel with random parameters.

    In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and
    `rotation_range` are ignored.

    Args:
        kernel_size (int): Must be odd.
        sigma_x_range (tuple): (low, high) for the uniform draw of sigma_x.
        sigma_y_range (tuple): (low, high) for the uniform draw of sigma_y.
        rotation_range (tuple): (low, high) for the uniform draw of the
            rotation angle.
        beta_range (tuple): (low, high) for the shape parameter; with equal
            probability beta is drawn from [low, 1] or from [1, high].
        noise_range (tuple, optional): (low, high) of a per-entry uniform
            multiplicative noise applied to the kernel. Default: None
        isotropic (bool): Anisotropic sampling happens only when this is
            exactly ``False``. Default: True

    Returns:
        kernel (ndarray): kernel divided by its sum.
    """
    assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
    assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
    sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])

    if isotropic is not False:
        sigma_y, rotation = sigma_x, 0
    else:
        assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
        assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
        sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
        rotation = np.random.uniform(rotation_range[0], rotation_range[1])

    # assume beta_range[0] < 1 < beta_range[1]
    # The coin flip is drawn before beta so the random stream order is fixed.
    use_lower_half = np.random.uniform() < 0.5
    beta_low, beta_high = (beta_range[0], 1) if use_lower_half else (1, beta_range[1])
    beta = np.random.uniform(beta_low, beta_high)

    kernel = bivariate_generalized_Gaussian(kernel_size, sigma_x, sigma_y, rotation, beta, isotropic=isotropic)

    # add multiplicative noise
    if noise_range is not None:
        assert noise_range[0] < noise_range[1], 'Wrong noise range.'
        kernel = kernel * np.random.uniform(noise_range[0], noise_range[1], size=kernel.shape)
    return kernel / np.sum(kernel)
def random_mixed_kernels(kernel_list,
                         kernel_prob,
                         kernel_size=21,
                         sigma_x_range=(0.6, 5),
                         sigma_y_range=(0.6, 5),
                         rotation_range=(-math.pi, math.pi),
                         betag_range=(0.5, 8),
                         betap_range=(0.5, 8),
                         noise_range=None):
    """Randomly generate one kernel whose type is drawn from a weighted list.

    Args:
        kernel_list (tuple): names of the candidate kernel types. Supported
            names: 'iso', 'aniso', 'generalized_iso', 'generalized_aniso',
            'plateau_iso', 'plateau_aniso'.
        kernel_prob (tuple): weight of each entry of `kernel_list`, passed to
            :func:`random.choices`.
        kernel_size (int): Default: 21
        sigma_x_range (tuple): Default: (0.6, 5)
        sigma_y_range (tuple): Default: (0.6, 5)
        rotation_range (tuple): Default: (-math.pi, math.pi)
        betag_range (tuple): beta range for the generalized Gaussian kernels.
            Default: (0.5, 8)
        betap_range (tuple): beta range for the plateau kernels.
            Default: (0.5, 8)
        noise_range (tuple, optional): multiplicative kernel noise, e.g.
            [0.75, 1.25]. Forwarded to the Gaussian and generalized Gaussian
            generators only; plateau kernels are always generated without
            noise. Default: None

    Returns:
        kernel (ndarray):

    Raises:
        ValueError: if the drawn kernel type is not a supported name.
    """
    kernel_type = random.choices(kernel_list, kernel_prob)[0]
    if kernel_type == 'iso':
        kernel = random_bivariate_Gaussian(
            kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=True)
    elif kernel_type == 'aniso':
        kernel = random_bivariate_Gaussian(
            kernel_size, sigma_x_range, sigma_y_range, rotation_range, noise_range=noise_range, isotropic=False)
    elif kernel_type == 'generalized_iso':
        kernel = random_bivariate_generalized_Gaussian(
            kernel_size,
            sigma_x_range,
            sigma_y_range,
            rotation_range,
            betag_range,
            noise_range=noise_range,
            isotropic=True)
    elif kernel_type == 'generalized_aniso':
        kernel = random_bivariate_generalized_Gaussian(
            kernel_size,
            sigma_x_range,
            sigma_y_range,
            rotation_range,
            betag_range,
            noise_range=noise_range,
            isotropic=False)
    elif kernel_type == 'plateau_iso':
        # NOTE(review): noise_range is deliberately not forwarded here in the
        # original code; kept as-is -- confirm this is intended.
        kernel = random_bivariate_plateau(
            kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=True)
    elif kernel_type == 'plateau_aniso':
        kernel = random_bivariate_plateau(
            kernel_size, sigma_x_range, sigma_y_range, rotation_range, betap_range, noise_range=None, isotropic=False)
    else:
        # Previously fell through and failed with an UnboundLocalError on
        # `kernel`, which hid the actual cause (a bad name in kernel_list).
        raise ValueError(f'Unsupported kernel type: {kernel_type!r}. Supported types are: '
                         "'iso', 'aniso', 'generalized_iso', 'generalized_aniso', "
                         "'plateau_iso', 'plateau_aniso'.")
    return kernel
def generate_gaussian_noise(img, sigma=10, gray_noise=False):
    """Generate Gaussian noise shaped like an image.

    Args:
        img (Numpy array): Input image, shape (h, w, c); only its shape is
            used.
        sigma (float): Noise scale (measured in range 255). Default: 10.
        gray_noise (bool): If True, a single (h, w) noise plane is drawn and
            repeated over 3 channels. Default: False.

    Returns:
        (Numpy array): The noise only (not added to `img`), float32. Shape
            `img.shape`, or (h, w, 3) when `gray_noise` is True.
    """
    if not gray_noise:
        return np.float32(np.random.randn(*(img.shape))) * sigma / 255.
    plane = np.float32(np.random.randn(*(img.shape[0:2]))) * sigma / 255.
    return np.repeat(plane[:, :, np.newaxis], 3, axis=2)
def generate_gaussian_noise_pt(img, sigma=10, gray_noise=0):
    """Generate Gaussian noise for a batch of images (PyTorch version).

    Args:
        img (Tensor): Shape (b, c, h, w); only its size, dtype and device are
            used.
        sigma (float | int | Tensor): Noise scale (measured in range 255).
            A Tensor is viewed as (b, 1, 1, 1), i.e. one scale per sample.
            Default: 10.
        gray_noise (float | int | Tensor): A number > 0 enables gray noise; a
            Tensor is viewed as (b, 1, 1, 1) and used as a per-sample blend
            weight between color noise (0) and gray noise (1). Default: 0.

    Returns:
        (Tensor): The noise only (not added to `img`), shape (b, c, h, w).
    """
    b, _, h, w = img.size()
    if not isinstance(sigma, (float, int)):
        sigma = sigma.view(b, 1, 1, 1)
    if isinstance(gray_noise, (float, int)):
        cal_gray_noise = gray_noise > 0
    else:
        gray_noise = gray_noise.view(b, 1, 1, 1)
        cal_gray_noise = torch.sum(gray_noise) > 0

    if cal_gray_noise:
        noise_gray = torch.randn(h, w, dtype=img.dtype, device=img.device) * sigma / 255.
        # With a per-sample sigma the product already broadcasts to
        # (b, 1, h, w); with a scalar sigma it stays (h, w), which the former
        # `.view(b, 1, h, w)` could not reshape for b > 1. `expand` covers
        # both cases without changing the random draws of the working path.
        # NOTE(review): the same (h, w) gray pattern is shared by all samples
        # in the batch (only its scale differs) -- confirm this is intended.
        noise_gray = noise_gray.expand(b, 1, h, w)

    # always calculate color noise
    noise = torch.randn(*img.size(), dtype=img.dtype, device=img.device) * sigma / 255.

    if cal_gray_noise:
        noise = noise * (1 - gray_noise) + noise_gray * gray_noise
    return noise
def generate_poisson_noise(img, scale=1.0, gray_noise=False):
    """Generate Poisson (shot) noise for an image.

    Reference: https://github.com/scikit-image/scikit-image/blob/main/skimage/util/noise.py#L37-L219

    Args:
        img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
        scale (float): Multiplier applied to the generated noise. Default: 1.0.
        gray_noise (bool): If True, the noise is computed on the grayscale
            version of the image and copied to 3 channels. Default: False.

    Returns:
        (Numpy array): Generated noise (not the noisy image), shape (h, w, c).
    """
    if gray_noise:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Quantize to 8-bit levels first so the unique-value count is meaningful.
    quantized = np.clip((img * 255.0).round(), 0, 255) / 255.

    # Number of distinct intensities, rounded up to the next power of two.
    num_levels = 2**np.ceil(np.log2(len(np.unique(quantized))))

    noisy = np.float32(np.random.poisson(quantized * num_levels) / float(num_levels))
    noise = noisy - quantized

    if gray_noise:
        noise = np.repeat(noise[:, :, np.newaxis], 3, axis=2)
    return noise * scale
def add_poisson_noise_pt(img, scale=1.0, clip=True, rounds=False, gray_noise=0):
    """Add Poisson noise to a batch of images (PyTorch version).

    Args:
        img (Tensor): Input image, shape (b, c, h, w), range [0, 1], float32.
        scale (float | Tensor): Noise scale. Number or Tensor with shape (b).
            Default: 1.0.
        clip (bool): Clamp the result to the valid range. Default: True.
        rounds (bool): Round the result to 8-bit levels (multiples of 1/255).
            Default: False.
        gray_noise (float | Tensor): 0-1 number or Tensor with shape (b).
            0 for False, 1 for True. Default: 0.

    Returns:
        (Tensor): Noisy image, shape (b, c, h, w), float32; within [0, 1]
            when ``clip`` is True.
    """
    noisy = img + generate_poisson_noise_pt(img, scale, gray_noise)

    if rounds:
        # Work on the 0-255 scale so rounding snaps to 8-bit levels.
        noisy = (noisy * 255.0).round()
        if clip:
            noisy = torch.clamp(noisy, 0, 255)
        return noisy / 255.

    if clip:
        return torch.clamp(noisy, 0, 1)
    return noisy
def paired_random_crop(img_gts, img_lqs, gt_patch_size, scale, gt_path=None):
    """Crop GT and LQ images at corresponding random locations.

    Supports Numpy array and Tensor inputs. Numpy arrays are indexed as
    (h, w, ...); Tensors are indexed on their last two dimensions, so both
    (c, h, w) and (b, c, h, w) layouts work.

    Args:
        img_gts (list[ndarray] | ndarray | list[Tensor] | Tensor): GT images.
            All images should have the same shape. A single image is wrapped
            into a list.
        img_lqs (list[ndarray] | ndarray | list[Tensor] | Tensor): LQ images.
            All images should have the same shape. A single image is wrapped
            into a list.
        gt_patch_size (int): GT patch size.
        scale (int): Scale factor between GT and LQ.
        gt_path (str): Path to ground-truth, only used in the error message.
            Default: None.

    Returns:
        tuple: ``(img_gts, img_lqs)``. Each element is a list of patches, or
            the patch itself when the list has exactly one element.

    Raises:
        ValueError: If the GT size is not ``scale`` times the LQ size, or if
            the LQ image is smaller than the LQ patch size.
    """
    if not isinstance(img_gts, list):
        img_gts = [img_gts]
    if not isinstance(img_lqs, list):
        img_lqs = [img_lqs]

    # The container type of the first GT image decides how both lists are indexed.
    is_tensor = torch.is_tensor(img_gts[0])
    if is_tensor:
        h_lq, w_lq = img_lqs[0].size()[-2:]
        h_gt, w_gt = img_gts[0].size()[-2:]
    else:
        h_lq, w_lq = img_lqs[0].shape[0:2]
        h_gt, w_gt = img_gts[0].shape[0:2]
    lq_patch_size = gt_patch_size // scale

    if h_gt != h_lq * scale or w_gt != w_lq * scale:
        # Adjacent f-strings (no comma) so the exception carries one message.
        raise ValueError(f'Scale mismatches. GT ({h_gt}, {w_gt}) is not {scale}x '
                         f'multiplication of LQ ({h_lq}, {w_lq}).')
    if h_lq < lq_patch_size or w_lq < lq_patch_size:
        raise ValueError(f'LQ ({h_lq}, {w_lq}) is smaller than patch size '
                         f'({lq_patch_size}, {lq_patch_size}). '
                         f'Please remove {gt_path}.')

    # randomly choose top and left coordinates for lq patch
    top = random.randint(0, h_lq - lq_patch_size)
    left = random.randint(0, w_lq - lq_patch_size)
    # corresponding gt coordinates
    top_gt, left_gt = int(top * scale), int(left * scale)

    if is_tensor:
        # Ellipsis keeps every leading dimension, matching the size()[-2:] read above.
        img_lqs = [v[..., top:top + lq_patch_size, left:left + lq_patch_size] for v in img_lqs]
        img_gts = [v[..., top_gt:top_gt + gt_patch_size, left_gt:left_gt + gt_patch_size] for v in img_gts]
    else:
        img_lqs = [v[top:top + lq_patch_size, left:left + lq_patch_size, ...] for v in img_lqs]
        img_gts = [v[top_gt:top_gt + gt_patch_size, left_gt:left_gt + gt_patch_size, ...] for v in img_gts]

    if len(img_gts) == 1:
        img_gts = img_gts[0]
    if len(img_lqs) == 1:
        img_lqs = img_lqs[0]
    return img_gts, img_lqs
def img_rotate(img, angle, center=None, scale=1.0):
    """Rotate an image around a pivot point, keeping the original canvas size.

    Args:
        img (ndarray): Image to be rotated.
        angle (float): Rotation angle in degrees. Positive values mean
            counter-clockwise rotation.
        center (tuple[int]): Rotation center as (x, y). If None, the center
            of the image is used. Default: None.
        scale (float): Isotropic scale factor. Default: 1.0.

    Returns:
        ndarray: The rotated image, with the same width and height as ``img``.
    """
    height, width = img.shape[:2]
    pivot = (width // 2, height // 2) if center is None else center
    affine = cv2.getRotationMatrix2D(pivot, angle, scale)
    return cv2.warpAffine(img, affine, (width, height))
def bgr2ycbcr(img, y_only=False):
    """Convert a BGR image to a YCbCr image (BGR counterpart of rgb2ycbcr).

    Implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].
        y_only (bool): Whether to only return Y channel. Default: False.

    Returns:
        ndarray: The converted YCbCr image. The output image has the same type
            and range as input image.
    """
    src_dtype = img.dtype
    normalized = _convert_input_type_range(img)

    if not y_only:
        # Rows are the B, G, R contributions; columns are Y, Cb, Cr.
        coeffs = [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], [65.481, -37.797, 112.0]]
        converted = np.matmul(normalized, coeffs) + [16, 128, 128]
    else:
        converted = np.dot(normalized, [24.966, 128.553, 65.481]) + 16.0

    return _convert_output_type_range(converted, src_dtype)
def ycbcr2bgr(img):
    """Convert a YCbCr image to a BGR image (BGR counterpart of ycbcr2rgb).

    Implements the ITU-R BT.601 conversion for standard-definition
    television. See more details in
    https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.

    It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.
    In OpenCV, it implements a JPEG conversion. See more details in
    https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.

    Args:
        img (ndarray): The input image. It accepts:
            1. np.uint8 type with range [0, 255];
            2. np.float32 type with range [0, 1].

    Returns:
        ndarray: The converted BGR image. The output image has the same type
            and range as input image.
    """
    src_dtype = img.dtype
    # Bring the normalized [0, 1] input back to the 0-255 scale for the matrix below.
    scaled = _convert_input_type_range(img) * 255

    # Rows are the Y, Cb, Cr contributions; columns are B, G, R.
    weights = [[0.00456621, 0.00456621, 0.00456621], [0.00791071, -0.00153632, 0],
               [0, -0.00318811, 0.00625893]]
    offsets = [-276.836, 135.576, -222.921]
    converted = np.matmul(scaled, weights) * 255.0 + offsets

    return _convert_output_type_range(converted, src_dtype)
It is mainly used for pre-processing the input image in colorspace conversion functions such as rgb2ycbcr and ycbcr2rgb. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. Returns: (ndarray): The converted image with type of np.float32 and range of [0, 1]. """ img_type = img.dtype img = img.astype(np.float32) if img_type == np.float32: pass elif img_type == np.uint8: img /= 255. else: raise TypeError(f'The img type should be np.float32 or np.uint8, but got {img_type}') return img def _convert_output_type_range(img, dst_type): """Convert the type and range of the image according to dst_type. It converts the image to desired type and range. If `dst_type` is np.uint8, images will be converted to np.uint8 type with range [0, 255]. If `dst_type` is np.float32, it converts the image to np.float32 type with range [0, 1]. It is mainly used for post-processing images in colorspace conversion functions such as rgb2ycbcr and ycbcr2rgb. Args: img (ndarray): The image to be converted with np.float32 type and range [0, 255]. dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it converts the image to np.uint8 type with range [0, 255]. If dst_type is np.float32, it converts the image to np.float32 type with range [0, 1]. Returns: (ndarray): The converted image with desired type and range. """ if dst_type not in (np.uint8, np.float32): raise TypeError(f'The dst_type should be np.float32 or np.uint8, but got {dst_type}') if dst_type == np.uint8: img = img.round() else: img /= 255. return img.astype(dst_type) def rgb2ycbcr_pt(img, y_only=False): """Convert RGB images to YCbCr images (PyTorch version). It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. Args: img (Tensor): Images with shape (n, 3, h, w), the range [0, 1], float, RGB format. y_only (bool): Whether to only return Y channel. 
def quality_to_factor(quality):
    """Map a JPEG quality setting to a quantization-table scaling factor.

    Qualities below 50 map to ``5000 / quality`` percent, the rest to
    ``200 - 2 * quality`` percent; the percentage is returned as a fraction.

    Args:
        quality(float): Quality for jpeg compression.

    Returns:
        float: Compression factor.
    """
    percent = 5000. / quality if quality < 50 else 200. - quality * 2
    return percent / 100.
class BlockSplitting(nn.Module):
    """Split each image of a batch into non-overlapping 8x8 patches."""

    def __init__(self):
        super().__init__()
        # Patch side length.
        self.k = 8

    def forward(self, image):
        """
        Args:
            image(tensor): batch x height x width

        Returns:
            Tensor:  batch x (height*width/64) x 8 x 8, patches ordered
                row-block by row-block.
        """
        side = self.k
        num_images = image.shape[0]
        height, _ = image.shape[1:3]
        # (b, h/8, 8, w/8, 8): split both spatial axes into (block, offset).
        blocks = image.view(num_images, height // side, side, -1, side)
        # (b, h/8, w/8, 8, 8): bring the two block indices together.
        blocks = blocks.permute(0, 1, 3, 2, 4).contiguous()
        return blocks.view(num_images, -1, side, side)
class YQuantize(nn.Module):
    """JPEG quantization for the Y channel.

    Args:
        rounding(function): rounding function applied after dividing by the
            (scaled) luminance quantization table.
    """

    def __init__(self, rounding):
        super().__init__()
        self.rounding = rounding
        self.y_table = y_table

    def forward(self, image, factor=1):
        """
        Args:
            image(tensor): DCT coefficients whose last two dims are 8 x 8.
            factor(int | float | Tensor): Scale applied to the quantization
                table. A Tensor supplies one factor per batch element.

        Returns:
            Tensor: quantized coefficients, same shape as ``image``.
        """
        if isinstance(factor, (int, float)):
            table = self.y_table * factor
        else:
            # One table per sample, broadcast over that sample's patches.
            batch = factor.size(0)
            table = self.y_table.expand(batch, 1, 8, 8) * factor.view(batch, 1, 1, 1)
        return self.rounding(image.float() / table)
class CDequantize(nn.Module):
    """Dequantize the CbCr channels by multiplying with the chroma table."""

    def __init__(self):
        super().__init__()
        self.c_table = c_table

    def forward(self, image, factor=1):
        """
        Args:
            image(tensor): quantized coefficients whose last two dims are 8 x 8.
            factor(int | float | Tensor): Scale applied to the quantization
                table. A Tensor supplies one factor per batch element.

        Returns:
            Tensor: dequantized coefficients, same shape as ``image``.
        """
        if not isinstance(factor, (int, float)):
            # One table per sample, broadcast over that sample's patches.
            batch = factor.size(0)
            per_sample_table = self.c_table.expand(batch, 1, 8, 8) * factor.view(batch, 1, 1, 1)
            return image * per_sample_table
        return image * (self.c_table * factor)
class ChromaUpsampling(nn.Module):
    """Upsample the chroma planes by 2 and stack them with the luma plane."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def _upsample(plane, k=2):
        """Repeat every element of a (batch, h, w) plane into a k x k cell."""
        height, width = plane.shape[1:3]
        tiled = plane.unsqueeze(-1).repeat(1, 1, k, k)
        return tiled.view(-1, height * k, width * k)

    def forward(self, y, cb, cr):
        """
        Args:
            y(tensor): y channel image
            cb(tensor): cb channel
            cr(tensor): cr channel

        Returns:
            Tensor: batch x height x width x 3
        """
        planes = [y, self._upsample(cb), self._upsample(cr)]
        return torch.stack(planes, dim=3)
class DiffJPEG(nn.Module):
    """JPEG compression followed by decompression, with batch support.

    The result is slightly different from cv2's JPEG codec.

    Args:
        differentiable(bool): If True, uses custom differentiable rounding
            function, if False, uses standard torch.round
    """

    def __init__(self, differentiable=True):
        super(DiffJPEG, self).__init__()
        rounding = diff_round if differentiable else torch.round
        self.compress = CompressJpeg(rounding=rounding)
        self.decompress = DeCompressJpeg(rounding=rounding)

    def forward(self, x, quality):
        """
        Args:
            x (Tensor): Input image, bchw, rgb, [0, 1]
            quality(float | int | Tensor): Quality factor for jpeg compression
                scheme. A Tensor supplies one quality per batch element and
                is left unmodified.

        Returns:
            Tensor: The compressed-then-decompressed image, same size as ``x``.
        """
        if isinstance(quality, (int, float)):
            factor = quality_to_factor(quality)
        else:
            per_sample = quality.reshape(-1)
            # Integer qualities would truncate fractional factors (e.g. 2.5),
            # so convert them to the image's dtype first.
            if not torch.is_floating_point(per_sample):
                per_sample = per_sample.to(x.dtype)
            # Build a new tensor instead of writing the factors back into the
            # caller's `quality` tensor.
            factor = torch.stack([quality_to_factor(q) for q in per_sample])

        h, w = x.size()[-2:]
        # Pad to a multiple of 16: the chroma planes are subsampled by 2 and
        # then split into 8x8 blocks, so both need 16-aligned inputs.
        h_pad = (16 - h % 16) % 16
        w_pad = (16 - w % 16) % 16
        x = F.pad(x, (0, w_pad, 0, h_pad), mode='constant', value=0)

        y, cb, cr = self.compress(x, factor=factor)
        recovered = self.decompress(y, cb, cr, (h + h_pad), (w + w_pad), factor=factor)
        # Drop the padding again.
        return recovered[:, :, 0:h, 0:w]
# -------------- cv2 -------------- # encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 20] _, encimg = cv2.imencode('.jpg', img_gt * 255., encode_param) img_lq = np.float32(cv2.imdecode(encimg, 1)) cv2.imwrite('cv2_JPEG_20.png', img_lq) # -------------- DiffJPEG -------------- # jpeger = DiffJPEG(differentiable=False).cuda() img_gt = img2tensor(img_gt) img_gt = torch.stack([img_gt, img_gt]).cuda() quality = img_gt.new_tensor([20, 40]) out = jpeger(img_gt, quality=quality) cv2.imwrite('pt_JPEG_20.png', tensor2img(out[0])) cv2.imwrite('pt_JPEG_40.png', tensor2img(out[1])) ================================================ FILE: bsr/utils/dist_util.py ================================================ # Modified from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py # noqa: E501 import functools import os import subprocess import torch import torch.distributed as dist import torch.multiprocessing as mp def init_dist(launcher, backend='nccl', **kwargs): if mp.get_start_method(allow_none=True) is None: mp.set_start_method('spawn') if launcher == 'pytorch': _init_dist_pytorch(backend, **kwargs) elif launcher == 'slurm': _init_dist_slurm(backend, **kwargs) else: raise ValueError(f'Invalid launcher type: {launcher}') def _init_dist_pytorch(backend, **kwargs): rank = int(os.environ['RANK']) num_gpus = torch.cuda.device_count() torch.cuda.set_device(rank % num_gpus) dist.init_process_group(backend=backend, **kwargs) def _init_dist_slurm(backend, port=None): """Initialize slurm distributed training environment. If argument ``port`` is not specified, then the master port will be system environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system environment variable, then a default port ``29500`` will be used. Args: backend (str): Backend of torch.distributed. port (int, optional): Master port. Defaults to None. 
""" proc_id = int(os.environ['SLURM_PROCID']) ntasks = int(os.environ['SLURM_NTASKS']) node_list = os.environ['SLURM_NODELIST'] num_gpus = torch.cuda.device_count() torch.cuda.set_device(proc_id % num_gpus) addr = subprocess.getoutput(f'scontrol show hostname {node_list} | head -n1') # specify master port if port is not None: os.environ['MASTER_PORT'] = str(port) elif 'MASTER_PORT' in os.environ: pass # use MASTER_PORT in the environment variable else: # 29500 is torch.distributed default port os.environ['MASTER_PORT'] = '29500' os.environ['MASTER_ADDR'] = addr os.environ['WORLD_SIZE'] = str(ntasks) os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) os.environ['RANK'] = str(proc_id) dist.init_process_group(backend=backend) def get_dist_info(): if dist.is_available(): initialized = dist.is_initialized() else: initialized = False if initialized: rank = dist.get_rank() world_size = dist.get_world_size() else: rank = 0 world_size = 1 return rank, world_size def master_only(func): @functools.wraps(func) def wrapper(*args, **kwargs): rank, _ = get_dist_info() if rank == 0: return func(*args, **kwargs) return wrapper ================================================ FILE: bsr/utils/download_util.py ================================================ import math import os import requests from torch.hub import download_url_to_file, get_dir from tqdm import tqdm from urllib.parse import urlparse from .misc import sizeof_fmt def download_file_from_google_drive(file_id, save_path): """Download files from google drive. Reference: https://stackoverflow.com/questions/25010369/wget-curl-large-file-from-google-drive Args: file_id (str): File id. save_path (str): Save path. 
""" session = requests.Session() URL = 'https://docs.google.com/uc?export=download' params = {'id': file_id} response = session.get(URL, params=params, stream=True) token = get_confirm_token(response) if token: params['confirm'] = token response = session.get(URL, params=params, stream=True) # get file size response_file_size = session.get(URL, params=params, stream=True, headers={'Range': 'bytes=0-2'}) if 'Content-Range' in response_file_size.headers: file_size = int(response_file_size.headers['Content-Range'].split('/')[1]) else: file_size = None save_response_content(response, save_path, file_size) def get_confirm_token(response): for key, value in response.cookies.items(): if key.startswith('download_warning'): return value return None def save_response_content(response, destination, file_size=None, chunk_size=32768): if file_size is not None: pbar = tqdm(total=math.ceil(file_size / chunk_size), unit='chunk') readable_file_size = sizeof_fmt(file_size) else: pbar = None with open(destination, 'wb') as f: downloaded_size = 0 for chunk in response.iter_content(chunk_size): downloaded_size += chunk_size if pbar is not None: pbar.update(1) pbar.set_description(f'Download {sizeof_fmt(downloaded_size)} / {readable_file_size}') if chunk: # filter out keep-alive new chunks f.write(chunk) if pbar is not None: pbar.close() def load_file_from_url(url, model_dir=None, progress=True, file_name=None): """Load file form http url, will download models if necessary. Reference: https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py Args: url (str): URL to be downloaded. model_dir (str): The path to save the downloaded model. Should be a full path. If None, use pytorch hub_dir. Default: None. progress (bool): Whether to show the download progress. Default: True. file_name (str): The downloaded file name. If None, use the file name in the url. Default: None. Returns: str: The path to the downloaded file. 
""" if model_dir is None: # use the pytorch hub_dir hub_dir = get_dir() model_dir = os.path.join(hub_dir, 'checkpoints') os.makedirs(model_dir, exist_ok=True) parts = urlparse(url) filename = os.path.basename(parts.path) if file_name is not None: filename = file_name cached_file = os.path.abspath(os.path.join(model_dir, filename)) if not os.path.exists(cached_file): print(f'Downloading: "{url}" to {cached_file}\n') download_url_to_file(url, cached_file, hash_prefix=None, progress=progress) return cached_file ================================================ FILE: bsr/utils/file_client.py ================================================ # Modified from https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py # noqa: E501 from abc import ABCMeta, abstractmethod class BaseStorageBackend(metaclass=ABCMeta): """Abstract class of storage backends. All backends need to implement two apis: ``get()`` and ``get_text()``. ``get()`` reads the file as a byte stream and ``get_text()`` reads the file as texts. """ @abstractmethod def get(self, filepath): pass @abstractmethod def get_text(self, filepath): pass class MemcachedBackend(BaseStorageBackend): """Memcached storage backend. Attributes: server_list_cfg (str): Config file for memcached server list. client_cfg (str): Config file for memcached client. sys_path (str | None): Additional path to be appended to `sys.path`. Default: None. 
""" def __init__(self, server_list_cfg, client_cfg, sys_path=None): if sys_path is not None: import sys sys.path.append(sys_path) try: import mc except ImportError: raise ImportError('Please install memcached to enable MemcachedBackend.') self.server_list_cfg = server_list_cfg self.client_cfg = client_cfg self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, self.client_cfg) # mc.pyvector servers as a point which points to a memory cache self._mc_buffer = mc.pyvector() def get(self, filepath): filepath = str(filepath) import mc self._client.Get(filepath, self._mc_buffer) value_buf = mc.ConvertBuffer(self._mc_buffer) return value_buf def get_text(self, filepath): raise NotImplementedError class HardDiskBackend(BaseStorageBackend): """Raw hard disks storage backend.""" def get(self, filepath): filepath = str(filepath) with open(filepath, 'rb') as f: value_buf = f.read() return value_buf def get_text(self, filepath): filepath = str(filepath) with open(filepath, 'r') as f: value_buf = f.read() return value_buf class LmdbBackend(BaseStorageBackend): """Lmdb storage backend. Args: db_paths (str | list[str]): Lmdb database paths. client_keys (str | list[str]): Lmdb client keys. Default: 'default'. readonly (bool, optional): Lmdb environment parameter. If True, disallow any write operations. Default: True. lock (bool, optional): Lmdb environment parameter. If False, when concurrent access occurs, do not lock the database. Default: False. readahead (bool, optional): Lmdb environment parameter. If False, disable the OS filesystem readahead mechanism, which may improve random read performance when a database is larger than RAM. Default: False. Attributes: db_paths (list): Lmdb database path. _client (list): A list of several lmdb envs. 
""" def __init__(self, db_paths, client_keys='default', readonly=True, lock=False, readahead=False, **kwargs): try: import lmdb except ImportError: raise ImportError('Please install lmdb to enable LmdbBackend.') if isinstance(client_keys, str): client_keys = [client_keys] if isinstance(db_paths, list): self.db_paths = [str(v) for v in db_paths] elif isinstance(db_paths, str): self.db_paths = [str(db_paths)] assert len(client_keys) == len(self.db_paths), ('client_keys and db_paths should have the same length, ' f'but received {len(client_keys)} and {len(self.db_paths)}.') self._client = {} for client, path in zip(client_keys, self.db_paths): self._client[client] = lmdb.open(path, readonly=readonly, lock=lock, readahead=readahead, **kwargs) def get(self, filepath, client_key): """Get values according to the filepath from one lmdb named client_key. Args: filepath (str | obj:`Path`): Here, filepath is the lmdb key. client_key (str): Used for distinguishing different lmdb envs. """ filepath = str(filepath) assert client_key in self._client, (f'client_key {client_key} is not in lmdb clients.') client = self._client[client_key] with client.begin(write=False) as txn: value_buf = txn.get(filepath.encode('ascii')) return value_buf def get_text(self, filepath): raise NotImplementedError class FileClient(object): """A general file client to access files in different backend. The client loads a file or text in a specified backend from its path and return it as a binary file. it can also register other backend accessor with a given name and backend class. Attributes: backend (str): The storage backend type. Options are "disk", "memcached" and "lmdb". client (:obj:`BaseStorageBackend`): The backend object. """ _backends = { 'disk': HardDiskBackend, 'memcached': MemcachedBackend, 'lmdb': LmdbBackend, } def __init__(self, backend='disk', **kwargs): if backend not in self._backends: raise ValueError(f'Backend {backend} is not supported. 
Currently supported ones' f' are {list(self._backends.keys())}') self.backend = backend self.client = self._backends[backend](**kwargs) def get(self, filepath, client_key='default'): # client_key is used only for lmdb, where different fileclients have # different lmdb environments. if self.backend == 'lmdb': return self.client.get(filepath, client_key) else: return self.client.get(filepath) def get_text(self, filepath): return self.client.get_text(filepath) ================================================ FILE: bsr/utils/flow_util.py ================================================ # Modified from https://github.com/open-mmlab/mmcv/blob/master/mmcv/video/optflow.py # noqa: E501 import cv2 import numpy as np import os def flowread(flow_path, quantize=False, concat_axis=0, *args, **kwargs): """Read an optical flow map. Args: flow_path (ndarray or str): Flow path. quantize (bool): whether to read quantized pair, if set to True, remaining args will be passed to :func:`dequantize_flow`. concat_axis (int): The axis that dx and dy are concatenated, can be either 0 or 1. Ignored if quantize is False. 
def flowwrite(flow, filename, quantize=False, concat_axis=0, *args, **kwargs):
    """Write optical flow to file.

    If the flow is not quantized, it will be saved as a .flo file losslessly,
    otherwise a jpeg image which is lossy but of much smaller size. (dx and dy
    will be concatenated along ``concat_axis`` into a single image if quantize
    is True.)

    Args:
        flow (ndarray): (h, w, 2) array of optical flow.
        filename (str): Output filepath.
        quantize (bool): Whether to quantize the flow and save it as a single
            image holding dx and dy. If set to True, remaining args will be
            passed to :func:`quantize_flow`.
        concat_axis (int): The axis that dx and dy are concatenated,
            can be either 0 or 1. Ignored if quantize is False.
    """
    if not quantize:
        with open(filename, 'wb') as f:
            # Layout: 4-byte magic, int32 width, int32 height, float32 data.
            f.write('PIEH'.encode('utf-8'))
            np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f)
            flow = flow.astype(np.float32)
            flow.tofile(f)
            f.flush()
    else:
        assert concat_axis in [0, 1]
        dx, dy = quantize_flow(flow, *args, **kwargs)
        dxdy = np.concatenate((dx, dy), axis=concat_axis)
        # abspath keeps makedirs from receiving '' (and raising) when
        # ``filename`` has no directory component.
        os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)
        cv2.imwrite(filename, dxdy)
def filter2D(img, kernel):
    """PyTorch version of cv2.filter2D

    The image is reflect-padded by half the kernel size so the output keeps
    the input's height and width.

    Args:
        img (Tensor): (b, c, h, w)
        kernel (Tensor): (b, k, k); a leading size of 1 applies the same
            kernel to every image in the batch.

    Raises:
        ValueError: If the kernel size ``k`` is even.
    """
    ksize = kernel.size(-1)
    batch, channels, height, width = img.size()
    if ksize % 2 != 1:
        raise ValueError('Wrong kernel size')

    half = ksize // 2
    padded = F.pad(img, (half, half, half, half), mode='reflect')
    pad_h, pad_w = padded.size()[-2:]
    planes = batch * channels

    if kernel.size(0) == 1:
        # One shared kernel: treat every (image, channel) plane as its own sample.
        shared = kernel.view(1, 1, ksize, ksize)
        filtered = F.conv2d(padded.view(planes, 1, pad_h, pad_w), shared, padding=0)
    else:
        # One kernel per image: a grouped convolution with one group per
        # plane, each image's kernel repeated for all of its channels.
        per_plane = kernel.view(batch, 1, ksize, ksize).repeat(1, channels, 1, 1).view(planes, 1, ksize, ksize)
        filtered = F.conv2d(padded.view(1, planes, pad_h, pad_w), per_plane, groups=planes)
    return filtered.view(batch, channels, height, width)
def tensor2img(tensor, rgb2bgr=True, out_type=np.uint8, min_max=(0, 1)):
    """Convert torch Tensors into image numpy arrays.

    After clamping to [min, max], values will be normalized to [0, 1].
    The input tensors are not modified.

    Args:
        tensor (Tensor or list[Tensor]): Accept shapes:
            1) 4D mini-batch Tensor of shape (B x 3/1 x H x W);
            2) 3D Tensor of shape (3/1 x H x W);
            3) 2D Tensor of shape (H x W).
            Tensor channel should be in RGB order.
        rgb2bgr (bool): Whether to change rgb to bgr.
        out_type (numpy type): output types. If ``np.uint8``, transform outputs
            to uint8 type with range [0, 255]; otherwise, float type with
            range [0, 1]. Default: ``np.uint8``.
        min_max (tuple[int]): min and max values for clamp.

    Returns:
        (ndarray or list[ndarray]): 3D ndarray of shape (H x W x C) OR 2D
        ndarray of shape (H x W). The channel order is BGR when ``rgb2bgr``
        is True. A single array is returned when only one image results.

    Raises:
        TypeError: If the input is neither a tensor nor a list of tensors, or
            a tensor has an unsupported number of dimensions.
    """
    if not (torch.is_tensor(tensor) or (isinstance(tensor, list) and all(torch.is_tensor(t) for t in tensor))):
        raise TypeError(f'tensor or list of tensors expected, got {type(tensor)}')

    if torch.is_tensor(tensor):
        tensor = [tensor]
    result = []
    for _tensor in tensor:
        # Out-of-place clamp: for a float32 CPU input, float()/detach()/cpu()
        # all share storage with the caller's tensor, so an in-place clamp_
        # would overwrite the caller's data.
        _tensor = _tensor.squeeze(0).float().detach().cpu().clamp(*min_max)
        _tensor = (_tensor - min_max[0]) / (min_max[1] - min_max[0])

        n_dim = _tensor.dim()
        if n_dim == 4:
            # Tile the mini-batch into one roughly square grid image.
            img_np = make_grid(_tensor, nrow=int(math.sqrt(_tensor.size(0))), normalize=False).numpy()
            img_np = img_np.transpose(1, 2, 0)
            if rgb2bgr:
                img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
        elif n_dim == 3:
            img_np = _tensor.numpy()
            img_np = img_np.transpose(1, 2, 0)
            if img_np.shape[2] == 1:  # gray image
                img_np = np.squeeze(img_np, axis=2)
            elif rgb2bgr:
                img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
        elif n_dim == 2:
            img_np = _tensor.numpy()
        else:
            raise TypeError(f'Only support 4D, 3D or 2D tensor. But received with dimension: {n_dim}')
        if out_type == np.uint8:
            # Unlike MATLAB, numpy.uint8() WILL NOT round by default.
            img_np = (img_np * 255.0).round()
        img_np = img_np.astype(out_type)
        result.append(img_np)
    if len(result) == 1:
        result = result[0]
    return result
def crop_border(imgs, crop_border):
    """Strip an equally wide margin from the first two axes of images.

    Args:
        imgs (list[ndarray] | ndarray): Images with shape (h, w, c).
        crop_border (int): Number of pixels removed at each end of height
            and width. 0 returns the input unchanged.

    Returns:
        list[ndarray] | ndarray: Cropped images, a list when a list was given.
    """
    if crop_border == 0:
        return imgs

    # The same window applies to both the height and the width axis.
    window = slice(crop_border, -crop_border)
    if isinstance(imgs, list):
        return [img[window, window, ...] for img in imgs]
    return imgs[window, window, ...]
The file structure is: :: example.lmdb ├── data.mdb ├── lock.mdb ├── meta_info.txt The data.mdb and lock.mdb are standard lmdb files and you can refer to https://lmdb.readthedocs.io/en/release/ for more details. The meta_info.txt is a specified txt file to record the meta information of our datasets. It will be automatically created when preparing datasets by our provided dataset tools. Each line in the txt file records 1)image name (with extension), 2)image shape, and 3)compression level, separated by a white space. For example, the meta information could be: `000_00000000.png (720,1280,3) 1`, which means: 1) image name (with extension): 000_00000000.png; 2) image shape: (720,1280,3); 3) compression level: 1 We use the image name without extension as the lmdb key. If `multiprocessing_read` is True, it will read all the images to memory using multiprocessing. Thus, your server needs to have enough memory. Args: data_path (str): Data path for reading images. lmdb_path (str): Lmdb save path. img_path_list (str): Image path list. keys (str): Used for lmdb keys. batch (int): After processing batch images, lmdb commits. Default: 5000. compress_level (int): Compress level when encoding images. Default: 1. multiprocessing_read (bool): Whether use multiprocessing to read all the images to memory. Default: False. n_thread (int): For multiprocessing. map_size (int | None): Map size for lmdb env. If None, use the estimated size from images. Default: None """ assert len(img_path_list) == len(keys), ('img_path_list and keys should have the same length, ' f'but got {len(img_path_list)} and {len(keys)}') print(f'Create lmdb for {data_path}, save to {lmdb_path}...') print(f'Totoal images: {len(img_path_list)}') if not lmdb_path.endswith('.lmdb'): raise ValueError("lmdb_path must end with '.lmdb'.") if osp.exists(lmdb_path): print(f'Folder {lmdb_path} already exists. 
Exit.') sys.exit(1) if multiprocessing_read: # read all the images to memory (multiprocessing) dataset = {} # use dict to keep the order for multiprocessing shapes = {} print(f'Read images with multiprocessing, #thread: {n_thread} ...') pbar = tqdm(total=len(img_path_list), unit='image') def callback(arg): """get the image data and update pbar.""" key, dataset[key], shapes[key] = arg pbar.update(1) pbar.set_description(f'Read {key}') pool = Pool(n_thread) for path, key in zip(img_path_list, keys): pool.apply_async(read_img_worker, args=(osp.join(data_path, path), key, compress_level), callback=callback) pool.close() pool.join() pbar.close() print(f'Finish reading {len(img_path_list)} images.') # create lmdb environment if map_size is None: # obtain data size for one image img = cv2.imread(osp.join(data_path, img_path_list[0]), cv2.IMREAD_UNCHANGED) _, img_byte = cv2.imencode('.png', img, [cv2.IMWRITE_PNG_COMPRESSION, compress_level]) data_size_per_img = img_byte.nbytes print('Data size per image is: ', data_size_per_img) data_size = data_size_per_img * len(img_path_list) map_size = data_size * 10 env = lmdb.open(lmdb_path, map_size=map_size) # write data to lmdb pbar = tqdm(total=len(img_path_list), unit='chunk') txn = env.begin(write=True) txt_file = open(osp.join(lmdb_path, 'meta_info.txt'), 'w') for idx, (path, key) in enumerate(zip(img_path_list, keys)): pbar.update(1) pbar.set_description(f'Write {key}') key_byte = key.encode('ascii') if multiprocessing_read: img_byte = dataset[key] h, w, c = shapes[key] else: _, img_byte, img_shape = read_img_worker(osp.join(data_path, path), key, compress_level) h, w, c = img_shape txn.put(key_byte, img_byte) # write meta information txt_file.write(f'{key}.png ({h},{w},{c}) {compress_level}\n') if idx % batch == 0: txn.commit() txn = env.begin(write=True) pbar.close() txn.commit() env.close() txt_file.close() print('\nFinish writing lmdb.') def read_img_worker(path, key, compress_level): """Read image worker. 
class AvgTimer():
    """Stopwatch reporting the latest interval and a windowed average.

    Args:
        window (int): Once more than this many intervals have been recorded,
            the running count and sum start over. Default: 200.
    """

    def __init__(self, window=200):
        self.window = window  # average window
        self.current_time = 0
        self.total_time = 0
        self.count = 0
        self.avg_time = 0
        self.start()

    def start(self):
        """Set both the start time and the interval start to now."""
        now = time.time()
        self.start_time = now
        self.tic = now

    def record(self):
        """Close the running interval, update the average, open the next one."""
        self.count += 1
        now = time.time()
        elapsed = now - self.tic
        self.toc = now
        self.current_time = elapsed
        self.total_time += elapsed
        # average over the intervals collected since the last reset
        self.avg_time = self.total_time / self.count

        # start a fresh window; avg_time keeps its value until the next record
        if self.count > self.window:
            self.count, self.total_time = 0, 0

        self.tic = time.time()

    def get_current_time(self):
        """Return the duration of the most recently recorded interval."""
        return self.current_time

    def get_avg_time(self):
        """Return the average computed by the latest ``record`` call."""
        return self.avg_time
def get_root_logger(logger_name='basicsr', log_level=logging.INFO, log_file=None):
    """Get the root logger.

    The logger will be initialized if it has not been initialized. By default a
    StreamHandler will be added. If `log_file` is specified, a FileHandler will
    also be added.

    Args:
        logger_name (str): root logger name. Default: 'basicsr'.
        log_level (int): The root logger level. Note that only the process of
            rank 0 is affected, while other processes will set the level to
            "Error" and be silent most of the time.
        log_file (str | None): The log filename. If specified, a FileHandler
            will be added to the root logger.

    Returns:
        logging.Logger: The root logger.
    """
    logger = logging.getLogger(logger_name)
    # if the logger has been initialized, just return it
    if logger_name in initialized_logger:
        return logger

    format_str = '%(asctime)s %(levelname)s: %(message)s'
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_str))
    logger.addHandler(stream_handler)
    # Handlers are attached here, so do not forward records to ancestor loggers.
    logger.propagate = False
    rank, _ = get_dist_info()
    if rank != 0:
        logger.setLevel('ERROR')
    else:
        # Always apply the requested level on rank 0. Without this the logger
        # keeps NOTSET and inherits the root logger's WARNING level, so INFO
        # messages were silently dropped whenever no `log_file` was given.
        logger.setLevel(log_level)
        if log_file is not None:
            # add file handler
            file_handler = logging.FileHandler(log_file, 'w')
            file_handler.setFormatter(logging.Formatter(format_str))
            file_handler.setLevel(log_level)
            logger.addHandler(file_handler)
    initialized_logger[logger_name] = True
    return logger
def cubic(x):
    """Evaluate the piecewise cubic kernel used by calculate_weights_indices.

    For ``|x| <= 1`` the value is ``1.5|x|^3 - 2.5|x|^2 + 1``; for
    ``1 < |x| <= 2`` it is ``-0.5|x|^3 + 2.5|x|^2 - 4|x| + 2``; elsewhere 0.

    Args:
        x (Tensor): Distances at which the kernel is evaluated.

    Returns:
        Tensor: Kernel values, same shape as ``x``.
    """
    magnitude = torch.abs(x)
    squared = magnitude**2
    cubed = magnitude**3

    # Polynomial pieces, evaluated everywhere and masked afterwards.
    near_poly = 1.5 * cubed - 2.5 * squared + 1
    near_mask = (magnitude <= 1).type_as(magnitude)
    far_poly = -0.5 * cubed + 2.5 * squared - 4 * magnitude + 2
    far_mask = ((magnitude > 1) * (magnitude <= 2)).type_as(magnitude)

    return near_poly * near_mask + far_poly * far_mask
u = x / scale + 0.5 * (1 - 1 / scale) # What is the left-most pixel that can be involved in the computation? left = torch.floor(u - kernel_width / 2) # What is the maximum number of pixels that can be involved in the # computation? Note: it's OK to use an extra pixel here; if the # corresponding weights are all zero, it will be eliminated at the end # of this function. p = math.ceil(kernel_width) + 2 # The indices of the input pixels involved in computing the k-th output # pixel are in row k of the indices matrix. indices = left.view(out_length, 1).expand(out_length, p) + torch.linspace(0, p - 1, p).view(1, p).expand( out_length, p) # The weights used to compute the k-th output pixel are in row k of the # weights matrix. distance_to_center = u.view(out_length, 1).expand(out_length, p) - indices # apply cubic kernel if (scale < 1) and antialiasing: weights = scale * cubic(distance_to_center * scale) else: weights = cubic(distance_to_center) # Normalize the weights matrix so that each row sums to 1. weights_sum = torch.sum(weights, 1).view(out_length, 1) weights = weights / weights_sum.expand(out_length, p) # If a column in weights is all zero, get rid of it. only consider the # first and last column. weights_zero_tmp = torch.sum((weights == 0), 0) if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6): indices = indices.narrow(1, 1, p - 2) weights = weights.narrow(1, 1, p - 2) if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6): indices = indices.narrow(1, 0, p - 2) weights = weights.narrow(1, 0, p - 2) weights = weights.contiguous() indices = indices.contiguous() sym_len_s = -indices.min() + 1 sym_len_e = indices.max() - in_length indices = indices + sym_len_s - 1 return weights, indices, int(sym_len_s), int(sym_len_e) @torch.no_grad() def imresize(img, scale, antialiasing=True): """imresize function same as MATLAB. It now only supports bicubic. The same scale applies for both height and width. 
Args: img (Tensor | Numpy array): Tensor: Input image with shape (c, h, w), [0, 1] range. Numpy: Input image with shape (h, w, c), [0, 1] range. scale (float): Scale factor. The same scale applies for both height and width. antialisaing (bool): Whether to apply anti-aliasing when downsampling. Default: True. Returns: Tensor: Output image with shape (c, h, w), [0, 1] range, w/o round. """ squeeze_flag = False if type(img).__module__ == np.__name__: # numpy type numpy_type = True if img.ndim == 2: img = img[:, :, None] squeeze_flag = True img = torch.from_numpy(img.transpose(2, 0, 1)).float() else: numpy_type = False if img.ndim == 2: img = img.unsqueeze(0) squeeze_flag = True in_c, in_h, in_w = img.size() out_h, out_w = math.ceil(in_h * scale), math.ceil(in_w * scale) kernel_width = 4 kernel = 'cubic' # get weights and indices weights_h, indices_h, sym_len_hs, sym_len_he = calculate_weights_indices(in_h, out_h, scale, kernel, kernel_width, antialiasing) weights_w, indices_w, sym_len_ws, sym_len_we = calculate_weights_indices(in_w, out_w, scale, kernel, kernel_width, antialiasing) # process H dimension # symmetric copying img_aug = torch.FloatTensor(in_c, in_h + sym_len_hs + sym_len_he, in_w) img_aug.narrow(1, sym_len_hs, in_h).copy_(img) sym_patch = img[:, :sym_len_hs, :] inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(1, inv_idx) img_aug.narrow(1, 0, sym_len_hs).copy_(sym_patch_inv) sym_patch = img[:, -sym_len_he:, :] inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(1, inv_idx) img_aug.narrow(1, sym_len_hs + in_h, sym_len_he).copy_(sym_patch_inv) out_1 = torch.FloatTensor(in_c, out_h, in_w) kernel_width = weights_h.size(1) for i in range(out_h): idx = int(indices_h[i][0]) for j in range(in_c): out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_h[i]) # process W dimension # symmetric copying out_1_aug = torch.FloatTensor(in_c, out_h, 
def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # Apply the same seed to every generator, in the order listed above.
    for apply_seed in seeders:
        apply_seed(seed)
def scandir(dir_path, suffix=None, recursive=False, full_path=False):
    """Scan a directory to find the interested files.

    Files whose name starts with ``'.'`` are never yielded.

    Args:
        dir_path (str): Path of the directory.
        suffix (str | tuple(str), optional): File suffix that we are
            interested in. Default: None.
        recursive (bool, optional): If set to True, recursively scan the
            directory. Default: False.
        full_path (bool, optional): If set to True, include the dir_path.
            Default: False.

    Returns:
        A generator for all the interested files with relative paths.

    Raises:
        TypeError: If ``suffix`` is neither None, a string nor a tuple.
    """
    if (suffix is not None) and not isinstance(suffix, (str, tuple)):
        raise TypeError('"suffix" must be a string or tuple of strings')

    root = dir_path

    def _scandir(dir_path, suffix, recursive):
        for entry in os.scandir(dir_path):
            if not entry.name.startswith('.') and entry.is_file():
                if full_path:
                    return_path = entry.path
                else:
                    return_path = osp.relpath(entry.path, root)

                if suffix is None or return_path.endswith(suffix):
                    yield return_path
            elif recursive and entry.is_dir():
                # Only descend into real directories. Hidden files (e.g.
                # '.DS_Store') also reach this branch and must be skipped,
                # otherwise os.scandir() raises NotADirectoryError on them.
                yield from _scandir(entry.path, suffix=suffix, recursive=recursive)

    return _scandir(dir_path, suffix=suffix, recursive=recursive)
def sizeof_fmt(size, suffix='B'):
    """Format a size as a human readable string with binary (1024) prefixes.

    Args:
        size (int): File size.
        suffix (str): Suffix appended after the prefix. Default: 'B'.

    Returns:
        str: Formatted file size, e.g. ``'1.5 KB'``.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    step = 0
    # Scale down by 1024 until the magnitude fits, or the prefixes run out.
    while step < len(prefixes) and not abs(size) < 1024.0:
        size /= 1024.0
        step += 1
    unit = prefixes[step] if step < len(prefixes) else 'Y'
    return f'{size:3.1f} {unit}{suffix}'
""" try: from yaml import CDumper as Dumper from yaml import CLoader as Loader except ImportError: from yaml import Dumper, Loader _mapping_tag = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG def dict_representer(dumper, data): return dumper.represent_dict(data.items()) def dict_constructor(loader, node): return OrderedDict(loader.construct_pairs(node)) Dumper.add_representer(OrderedDict, dict_representer) Loader.add_constructor(_mapping_tag, dict_constructor) return Loader, Dumper def yaml_load(f): """Load yaml file or string. Args: f (str): File path or a python string. Returns: dict: Loaded dict. """ if os.path.isfile(f): with open(f, 'r') as f: return yaml.load(f, Loader=ordered_yaml()[0]) else: return yaml.load(f, Loader=ordered_yaml()[0]) def dict2str(opt, indent_level=1): """dict to string for printing options. Args: opt (dict): Option dict. indent_level (int): Indent level. Default: 1. Return: (str): Option string for printing. """ msg = '\n' for k, v in opt.items(): if isinstance(v, dict): msg += ' ' * (indent_level * 2) + k + ':[' msg += dict2str(v, indent_level + 1) msg += ' ' * (indent_level * 2) + ']\n' else: msg += ' ' * (indent_level * 2) + k + ': ' + str(v) + '\n' return msg def _postprocess_yml_value(value): # None if value == '~' or value.lower() == 'none': return None # bool if value.lower() == 'true': return True elif value.lower() == 'false': return False # !!float number if value.startswith('!!float'): return float(value.replace('!!float', '')) # number if value.isdigit(): return int(value) elif value.replace('.', '', 1).isdigit() and value.count('.') < 2: return float(value) # list if value.startswith('['): return eval(value) # str return value def parse_options(root_path, is_train=True): parser = argparse.ArgumentParser() parser.add_argument('-opt', type=str, required=True, help='Path to option YAML file.') parser.add_argument('--launcher', choices=['none', 'pytorch', 'slurm'], default='none', help='job launcher') 
parser.add_argument('--auto_resume', action='store_true') parser.add_argument('--debug', action='store_true') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument( '--force_yml', nargs='+', default=None, help='Force to update yml files. Examples: train:ema_decay=0.999') args = parser.parse_args() # parse yml to dict opt = yaml_load(args.opt) # distributed settings if args.launcher == 'none': opt['dist'] = False print('Disable distributed.', flush=True) else: opt['dist'] = True if args.launcher == 'slurm' and 'dist_params' in opt: init_dist(args.launcher, **opt['dist_params']) else: init_dist(args.launcher) opt['rank'], opt['world_size'] = get_dist_info() # random seed seed = opt.get('manual_seed') if seed is None: seed = random.randint(1, 10000) opt['manual_seed'] = seed set_random_seed(seed + opt['rank']) # force to update yml options if args.force_yml is not None: for entry in args.force_yml: # now do not support creating new keys keys, value = entry.split('=') keys, value = keys.strip(), value.strip() value = _postprocess_yml_value(value) eval_str = 'opt' for key in keys.split(':'): eval_str += f'["{key}"]' eval_str += '=value' # using exec function exec(eval_str) opt['auto_resume'] = args.auto_resume opt['is_train'] = is_train # debug setting if args.debug and not opt['name'].startswith('debug'): opt['name'] = 'debug_' + opt['name'] if opt['num_gpu'] == 'auto': opt['num_gpu'] = torch.cuda.device_count() # datasets for phase, dataset in opt['datasets'].items(): # for multiple datasets, e.g., val_1, val_2; test_1, test_2 phase = phase.split('_')[0] dataset['phase'] = phase if 'scale' in opt: dataset['scale'] = opt['scale'] if dataset.get('dataroot_gt') is not None: dataset['dataroot_gt'] = osp.expanduser(dataset['dataroot_gt']) if dataset.get('dataroot_lq') is not None: dataset['dataroot_lq'] = osp.expanduser(dataset['dataroot_lq']) # paths for key, val in opt['path'].items(): if (val is not None) and ('resume_state' in key or 
@master_only
def copy_opt_file(opt_file, experiments_root):
    """Copy the option file into the experiment root and prepend a header.

    The header records the generation time and the command line used.

    Args:
        opt_file (str): Path of the option (yml) file to copy.
        experiments_root (str): Folder that receives the copy; the copy keeps
            the base name of ``opt_file``.
    """
    import sys
    import time
    from shutil import copyfile

    command_line = ' '.join(sys.argv)
    target = osp.join(experiments_root, osp.basename(opt_file))
    copyfile(opt_file, target)

    # Rewrite the copy in place: header first, original content after it.
    with open(target, 'r+') as stream:
        original_text = stream.read()
        banner = f'# GENERATE TIME: {time.asctime()}\n# CMD:\n# {command_line}\n\n'
        stream.seek(0)
        stream.write(banner + original_text)
def read_data_from_txt_2v(path, pattern, step_one=False):
    """Read (step, value) pairs from a text file using a regular expression.

    Each line is stripped and matched from its start against ``pattern``;
    group 1 is parsed as the integer step and group 2 as the float value.
    Lines that do not match are ignored.

    Args:
        path (str): path to the txt file.
        pattern (str): re (regular expression) pattern with two groups.
        step_one (bool): add 1 to steps. Default: False.

    Returns:
        tuple[list[int], list[float]]: The steps and the values.
    """
    with open(path) as handle:
        stripped = [raw.strip() for raw in handle.readlines()]

    regex = re.compile(pattern)
    hits = [found for found in map(regex.match, stripped) if found]

    offset = 1 if step_one else 0
    steps = [int(found.group(1)) + offset for found in hits]
    values = [float(found.group(2)) for found in hits]
    return steps, values
class Registry():
    """
    The registry that provides name -> object mapping, to support third-party
    users' custom modules.

    To create a registry (e.g. a backbone registry):

    .. code-block:: python

        BACKBONE_REGISTRY = Registry('BACKBONE')

    To register an object:

    .. code-block:: python

        @BACKBONE_REGISTRY.register()
        class MyBackbone():
            ...

    Or:

    .. code-block:: python

        BACKBONE_REGISTRY.register(MyBackbone)
    """

    def __init__(self, name):
        """
        Args:
            name (str): the name of this registry
        """
        self._name = name
        self._obj_map = {}

    def _do_register(self, name, obj, suffix=None):
        """Store ``obj`` under ``name`` (or ``name_suffix`` if a str suffix is given).

        Raises:
            AssertionError: If the resulting name is already registered.
        """
        if isinstance(suffix, str):
            name = name + '_' + suffix

        assert (name not in self._obj_map), (f"An object named '{name}' was already registered "
                                             f"in '{self._name}' registry!")
        self._obj_map[name] = obj

    def register(self, obj=None, suffix=None):
        """
        Register the given object under the name `obj.__name__`.
        Can be used as either a decorator or not.
        See docstring of this class for usage.

        Args:
            obj: Object to register. If None, a decorator is returned.
            suffix (str, optional): Appended to the name as ``name_suffix``.
        """
        if obj is None:
            # used as a decorator
            def deco(func_or_class):
                name = func_or_class.__name__
                self._do_register(name, func_or_class, suffix)
                return func_or_class

            return deco

        # used as a function call
        name = obj.__name__
        self._do_register(name, obj, suffix)

    def get(self, name, suffix='basicsr'):
        """Look up ``name``, falling back to ``name_suffix``.

        Args:
            name (str): Registered name.
            suffix (str): Suffix tried when ``name`` itself is not registered.
                Default: 'basicsr'.

        Returns:
            The registered object.

        Raises:
            KeyError: If neither ``name`` nor ``name_suffix`` is registered.
        """
        ret = self._obj_map.get(name)
        if ret is None:
            ret = self._obj_map.get(name + '_' + suffix)
            if ret is None:
                raise KeyError(f"No object named '{name}' found in '{self._name}' registry!")
            # Announce the fallback only when it actually resolved; printing it
            # right before raising KeyError would be misleading.
            print(f'Name {name} is not found, use name: {name}_{suffix}!')
        return ret

    def __contains__(self, name):
        return name in self._obj_map

    def __iter__(self):
        # Iterates over (name, object) pairs.
        return iter(self._obj_map.items())

    def keys(self):
        return self._obj_map.keys()
class RealESRGANDataset(torch.utils.data.Dataset):
    """Dataset yielding a ground-truth crop plus randomly sampled blur kernels.

    Every item holds the GT image tensor, two blur kernels (for the first and
    the second degradation) and a final sinc kernel, all of size 21x21.

    Args:
        opt: Option mapping, read with ``opt[key]``. Keys used here:
            ``dataroot_gt``, ``iter_num``, ``gt_size``, ``use_hflip``,
            ``use_rot``, ``final_sinc_prob`` and the blur settings
            ``blur_kernel_size``, ``kernel_list``, ``kernel_prob``,
            ``blur_sigma``, ``betag_range``, ``betap_range``, ``sinc_prob``
            together with their ``*2`` counterparts.
        bsz (int): Multiplied by ``opt["iter_num"]`` to obtain the reported
            dataset length (presumably the batch size; confirm against the
            caller).
    """

    def __init__(self, opt, bsz):
        super(RealESRGANDataset, self).__init__()
        self.opt = opt
        self.file_client = FileClient("disk")
        self.gt_folder = opt["dataroot_gt"]
        self.len = bsz * opt["iter_num"]
        # "**/*" also matches the sub-directories themselves; keep only regular
        # files so that a directory is never picked as an image path.
        self.paths = [
            path for path in glob.glob(os.path.join(self.gt_folder, "**/*"), recursive=True)
            if os.path.isfile(path)
        ]

        # blur settings for the first degradation
        self.blur_kernel_size = opt["blur_kernel_size"]
        self.kernel_list = opt["kernel_list"]
        self.kernel_prob = opt["kernel_prob"]  # a list for each kernel probability
        self.blur_sigma = opt["blur_sigma"]
        self.betag_range = opt["betag_range"]  # betag used in generalized Gaussian blur kernels
        self.betap_range = opt["betap_range"]  # betap used in plateau blur kernels
        self.sinc_prob = opt["sinc_prob"]  # the probability for sinc filters

        # blur settings for the second degradation
        self.blur_kernel_size2 = opt["blur_kernel_size2"]
        self.kernel_list2 = opt["kernel_list2"]
        self.kernel_prob2 = opt["kernel_prob2"]
        self.blur_sigma2 = opt["blur_sigma2"]
        self.betag_range2 = opt["betag_range2"]
        self.betap_range2 = opt["betap_range2"]
        self.sinc_prob2 = opt["sinc_prob2"]

        # a final sinc filter
        self.final_sinc_prob = opt["final_sinc_prob"]

        self.kernel_range = [2 * v + 1 for v in range(3, 11)]  # kernel size ranges from 7 to 21
        # TODO: kernel range is now hard-coded, should be in the configure file
        self.pulse_tensor = torch.zeros(21, 21).float()  # convolving with pulse tensor brings no blurry effect
        self.pulse_tensor[10, 10] = 1

    def _sample_blur_kernel(self, sinc_prob, kernel_list, kernel_prob, blur_sigma, betag_range, betap_range):
        """Draw one random blur kernel and zero-pad it to 21x21."""
        kernel_size = random.choice(self.kernel_range)
        if np.random.uniform() < sinc_prob:
            # this sinc filter setting is for kernels ranging from [7, 21]
            if kernel_size < 13:
                omega_c = np.random.uniform(np.pi / 3, np.pi)
            else:
                omega_c = np.random.uniform(np.pi / 5, np.pi)
            kernel = circular_lowpass_kernel(omega_c, kernel_size, pad_to=False)
        else:
            kernel = random_mixed_kernels(
                kernel_list,
                kernel_prob,
                kernel_size,
                blur_sigma,
                blur_sigma, [-math.pi, math.pi],
                betag_range,
                betap_range,
                noise_range=None)
        # pad kernel
        pad_size = (21 - kernel_size) // 2
        return np.pad(kernel, ((pad_size, pad_size), (pad_size, pad_size)))

    def __getitem__(self, index):
        """Return one training sample.

        Args:
            index (int): Ignored; a path is drawn at random on every call.

        Returns:
            dict: ``gt`` (image tensor), ``kernel1``, ``kernel2``,
            ``sinc_kernel`` (21x21 float tensors) and ``gt_path`` (str).
        """
        index = random.randint(0, len(self.paths) - 1)
        gt_path = self.paths[index]
        img_gt = imfrombytes(self.file_client.get(gt_path, "gt"), float32=True)
        img_gt = augment(img_gt, self.opt["use_hflip"], self.opt["use_rot"])

        h, w = img_gt.shape[0:2]
        # Key access like every other option, so plain dicts work as well.
        crop_pad_size = self.opt["gt_size"]
        # pad at the bottom/right when the image is smaller than the crop size
        if h < crop_pad_size or w < crop_pad_size:
            pad_h = max(0, crop_pad_size - h)
            pad_w = max(0, crop_pad_size - w)
            img_gt = cv2.copyMakeBorder(img_gt, 0, pad_h, 0, pad_w, cv2.BORDER_REFLECT_101)
        # random crop when the image is larger than the crop size
        if img_gt.shape[0] > crop_pad_size or img_gt.shape[1] > crop_pad_size:
            h, w = img_gt.shape[0:2]
            top = random.randint(0, h - crop_pad_size)
            left = random.randint(0, w - crop_pad_size)
            img_gt = img_gt[top:top + crop_pad_size, left:left + crop_pad_size, ...]

        # ------------------------ Generate kernels (used in the first degradation) ------------------------ #
        kernel = self._sample_blur_kernel(self.opt["sinc_prob"], self.kernel_list, self.kernel_prob,
                                          self.blur_sigma, self.betag_range, self.betap_range)

        # ------------------------ Generate kernels (used in the second degradation) ------------------------ #
        kernel2 = self._sample_blur_kernel(self.opt["sinc_prob2"], self.kernel_list2, self.kernel_prob2,
                                           self.blur_sigma2, self.betag_range2, self.betap_range2)

        # ------------------------------------- the final sinc kernel ------------------------------------- #
        if np.random.uniform() < self.opt["final_sinc_prob"]:
            kernel_size = random.choice(self.kernel_range)
            omega_c = np.random.uniform(np.pi / 3, np.pi)
            sinc_kernel = circular_lowpass_kernel(omega_c, kernel_size, pad_to=21)
            sinc_kernel = torch.FloatTensor(sinc_kernel)
        else:
            sinc_kernel = self.pulse_tensor

        # BGR to RGB, HWC to CHW, numpy to tensor
        img_gt = img2tensor([img_gt], bgr2rgb=True, float32=True)[0]
        kernel = torch.FloatTensor(kernel)
        kernel2 = torch.FloatTensor(kernel2)

        return_d = {"gt": img_gt, "kernel1": kernel, "kernel2": kernel2, "sinc_kernel": sinc_kernel, "gt_path": gt_path}
        return return_d

    def __len__(self):
        """Return ``bsz * opt["iter_num"]`` as fixed at construction time."""
        return self.len
""" # initialize b, c, h, w = self.lq.size() if not hasattr(self, "queue_lr"): assert self.queue_size % b == 0, f"queue size {self.queue_size} should be divisible by batch size {b}" self.queue_lr = torch.zeros(self.queue_size, c, h, w).to(self.device) _, c, h, w = self.gt.size() self.queue_gt = torch.zeros(self.queue_size, c, h, w).to(self.device) self.queue_ptr = 0 if self.queue_ptr == self.queue_size: # the pool is full # do dequeue and enqueue # shuffle idx = torch.randperm(self.queue_size) self.queue_lr = self.queue_lr[idx] self.queue_gt = self.queue_gt[idx] # get first b samples lq_dequeue = self.queue_lr[0:b, :, :, :].clone() gt_dequeue = self.queue_gt[0:b, :, :, :].clone() # update the queue self.queue_lr[0:b, :, :, :] = self.lq.clone() self.queue_gt[0:b, :, :, :] = self.gt.clone() self.lq = lq_dequeue self.gt = gt_dequeue else: # only do enqueue self.queue_lr[self.queue_ptr:self.queue_ptr + b, :, :, :] = self.lq.clone() self.queue_gt[self.queue_ptr:self.queue_ptr + b, :, :, :] = self.gt.clone() self.queue_ptr = self.queue_ptr + b @torch.no_grad() def degrade(self, data): """Accept data from dataloader, and then add two-order degradations to obtain LQ images. 
""" # training data synthesis self.gt = data["gt"].to(self.device) self.kernel1 = data["kernel1"].to(self.device) self.kernel2 = data["kernel2"].to(self.device) self.sinc_kernel = data["sinc_kernel"].to(self.device) ori_h, ori_w = self.gt.size()[2:4] # ----------------------- The first degradation process ----------------------- # # blur out = filter2D(self.gt, self.kernel1) # random resize updown_type = random.choices(["up", "down", "keep"], self.opt["resize_prob"])[0] if updown_type == "up": scale = np.random.uniform(1, self.opt["resize_range"][1]) elif updown_type == "down": scale = np.random.uniform(self.opt["resize_range"][0], 1) else: scale = 1 mode = random.choice(["area", "bilinear", "bicubic"]) out = F.interpolate(out, scale_factor=scale, mode=mode) # add noise gray_noise_prob = self.opt["gray_noise_prob"] if np.random.uniform() < self.opt["gaussian_noise_prob"]: out = random_add_gaussian_noise_pt( out, sigma_range=self.opt["noise_range"], clip=True, rounds=False, gray_prob=gray_noise_prob) else: out = random_add_poisson_noise_pt( out, scale_range=self.opt["poisson_scale_range"], gray_prob=gray_noise_prob, clip=True, rounds=False) # JPEG compression jpeg_p = out.new_zeros(out.size(0)).uniform_(*self.opt["jpeg_range"]) out = torch.clamp(out, 0, 1) # clamp to [0, 1], otherwise JPEGer will result in unpleasant artifacts out = self.jpeger(out, quality=jpeg_p) # ----------------------- The second degradation process ----------------------- # # blur if np.random.uniform() < self.opt["second_blur_prob"]: out = filter2D(out, self.kernel2) # random resize updown_type = random.choices(["up", "down", "keep"], self.opt["resize_prob2"])[0] if updown_type == "up": scale = np.random.uniform(1, self.opt["resize_range2"][1]) elif updown_type == "down": scale = np.random.uniform(self.opt["resize_range2"][0], 1) else: scale = 1 mode = random.choice(["area", "bilinear", "bicubic"]) out = F.interpolate( out, size=(int(ori_h / self.opt["scale"] * scale), int(ori_w / 
self.opt["scale"] * scale)), mode=mode) # add noise gray_noise_prob = self.opt["gray_noise_prob2"] if np.random.uniform() < self.opt["gaussian_noise_prob2"]: out = random_add_gaussian_noise_pt( out, sigma_range=self.opt["noise_range2"], clip=True, rounds=False, gray_prob=gray_noise_prob) else: out = random_add_poisson_noise_pt( out, scale_range=self.opt["poisson_scale_range2"], gray_prob=gray_noise_prob, clip=True, rounds=False) # JPEG compression + the final sinc filter # We also need to resize images to desired sizes. We group [resize back + sinc filter] together # as one operation. # We consider two orders: # 1. [resize back + sinc filter] + JPEG compression # 2. JPEG compression + [resize back + sinc filter] # Empirically, we find other combinations (sinc + JPEG + Resize) will introduce twisted lines. if np.random.uniform() < 0.5: # resize back + the final sinc filter mode = random.choice(["area", "bilinear", "bicubic"]) out = F.interpolate(out, size=(ori_h // self.opt["scale"], ori_w // self.opt["scale"]), mode=mode) out = filter2D(out, self.sinc_kernel) # JPEG compression jpeg_p = out.new_zeros(out.size(0)).uniform_(*self.opt["jpeg_range2"]) out = torch.clamp(out, 0, 1) out = self.jpeger(out, quality=jpeg_p) else: # JPEG compression jpeg_p = out.new_zeros(out.size(0)).uniform_(*self.opt["jpeg_range2"]) out = torch.clamp(out, 0, 1) out = self.jpeger(out, quality=jpeg_p) # resize back + the final sinc filter mode = random.choice(["area", "bilinear", "bicubic"]) out = F.interpolate(out, size=(ori_h // self.opt["scale"], ori_w // self.opt["scale"]), mode=mode) out = filter2D(out, self.sinc_kernel) # clamp and round self.lq = torch.clamp((out * 255.0).round(), 0, 255) / 255. 
# random crop gt_size = self.opt["gt_size"] self.gt, self.lq = paired_random_crop(self.gt, self.lq, gt_size, self.opt["scale"]) # training pair pool self._dequeue_and_enqueue() # sharpen self.gt again, as we have changed the self.gt with self._dequeue_and_enqueue self.lq = self.lq.contiguous() # for the warning: grad and param do not obey the gradient layout contract return self.lq, self.gt ================================================ FILE: evaluate.py ================================================ import torch, os, glob, pyiqa from argparse import ArgumentParser import numpy as np from PIL import Image from tqdm import tqdm from torchvision import transforms parser = ArgumentParser() parser.add_argument("--HR_dir", type=str, default="testset/RealSR/HR") parser.add_argument("--SR_dir", type=str, default="result/RealSR") args = parser.parse_args() device = torch.device("cuda") psnr = pyiqa.create_metric("psnr", test_y_channel=True, color_space="ycbcr", device=device) ssim = pyiqa.create_metric("ssim", test_y_channel=True, color_space="ycbcr", device=device) lpips = pyiqa.create_metric("lpips", device=device) dists = pyiqa.create_metric("dists", device=device) fid = pyiqa.create_metric("fid", device=device) niqe = pyiqa.create_metric("niqe", device=device) maniqa = pyiqa.create_metric("maniqa-pipal", device=device) clipiqa = pyiqa.create_metric("clipiqa", device=device) musiq = pyiqa.create_metric("musiq", device=device) test_SR_paths = list(sorted(glob.glob(os.path.join(args.SR_dir, "*")))) test_HR_paths = list(sorted(glob.glob(os.path.join(args.HR_dir, "*")))) metrics = {"psnr": [], "ssim": [], "lpips": [], "dists": [], "niqe": [], "maniqa": [], "musiq": [], "clipiqa": []} for i, (SR_path, HR_path) in tqdm(enumerate(zip(test_SR_paths, test_HR_paths))): SR = Image.open(SR_path).convert("RGB") SR = transforms.ToTensor()(SR).to(device).unsqueeze(0) HR = Image.open(HR_path).convert("RGB") HR = transforms.ToTensor()(HR).to(device).unsqueeze(0) 
metrics["psnr"].append(psnr(SR, HR).item()) metrics["ssim"].append(ssim(SR, HR).item()) metrics["lpips"].append(lpips(SR, HR).item()) metrics["dists"].append(dists(SR, HR).item()) metrics["niqe"].append(niqe(SR).item()) metrics["maniqa"].append(maniqa(SR).item()) metrics["clipiqa"].append(clipiqa(SR).item()) metrics["musiq"].append(musiq(SR).item()) for k in metrics.keys(): metrics[k] = np.mean(metrics[k]) metrics["fid"] = fid(args.SR_dir, args.HR_dir) for k, v in metrics.items(): if k == "niqe": print(k, f"{v:.3g}") elif k == "fid": print(k, f"{v:.5g}") else: print(k, f"{v:.4g}") ================================================ FILE: evaluate_debug.sh ================================================ HF_ENDPOINT=https://hf-mirror.com \ CUDA_VISIBLE_DEVICES=0 \ python -u evaluate.py \ --HR_dir=testset/RealSR/HR \ --SR_dir=result/RealSR ================================================ FILE: forward.py ================================================ import torch def MyUNet2DConditionModel_SD_forward(self, x): global skip x = self.conv_in(x) skip = [x] x = self.body(x) return x def MyCrossAttnDownBlock2D_SD_forward(self, x): for i in range(2): x = self.resnets[i](x) x = self.attentions[i](x) skip.append(x) if self.downsamplers is not None: x = self.downsamplers[0](x) skip.append(x) return x def MyCrossAttnUpBlock2D_SD_forward(self, x): for i in range(3): x = self.resnets[i](torch.cat([x, skip.pop()], dim=1)) x = self.attentions[i](x) if self.upsamplers is not None: x = self.upsamplers[0](x) return x def MyDownBlock2D_SD_forward(self, x): for i in range(2): x = self.resnets[i](x) skip.append(x) return x def MyUNetMidBlock2DCrossAttn_SD_forward(self, x): x = self.resnets[0](x) x = self.attentions[0](x) x = self.resnets[1](x) return x def MyUpBlock2D_SD_forward(self, x): for i in range(3): x = self.resnets[i](torch.cat([x, skip.pop()], dim=1)) x = self.upsamplers[0](x) return x def MyResnetBlock2D_SD_forward(self, x_in): x = self.norm1(x_in) x = self.nonlinearity(x) x = 
self.conv1(x) x = self.norm2(x) x = self.nonlinearity(x) x = self.conv2(x) if self.in_channels == self.out_channels: return x + x_in return x + self.conv_shortcut(x_in) def MyTransformer2DModel_SD_forward(self, x_in): b, c, h, w = x_in.shape x = self.norm(x_in) x = x.permute(0, 2, 3, 1).reshape(b, h * w, c).contiguous() x = self.proj_in(x) for block in self.transformer_blocks: x = x + block.attn1(block.norm1(x)) x = x + block.ff(block.norm3(x)) x = self.proj_out(x) x = x.reshape(b, h, w, c).permute(0, 3, 1, 2).contiguous() return x + x_in ================================================ FILE: model.py ================================================ import torch, types, copy from torch import nn import torch.nn.functional as F from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, \ CrossAttnUpBlock2D, \ DownBlock2D, \ UpBlock2D, \ UNetMidBlock2DCrossAttn from diffusers.models.resnet import ResnetBlock2D from diffusers.models.transformers.transformer_2d import Transformer2DModel from diffusers.models.attention import BasicTransformerBlock from diffusers.models.downsampling import Downsample2D from diffusers.models.upsampling import Upsample2D from forward import MyUNet2DConditionModel_SD_forward, \ MyCrossAttnDownBlock2D_SD_forward, \ MyDownBlock2D_SD_forward, \ MyUNetMidBlock2DCrossAttn_SD_forward, \ MyCrossAttnUpBlock2D_SD_forward, \ MyUpBlock2D_SD_forward, \ MyResnetBlock2D_SD_forward, \ MyTransformer2DModel_SD_forward def find_parent(model, module_name): components = module_name.split(".") parent = model for comp in components[:-1]: parent = getattr(parent, comp) return parent, components[-1] def halve_channels(model): for name, module in model.named_modules(): if hasattr(module, "pruned"): continue if isinstance(module, nn.Conv2d): in_channels = int(module.in_channels * 0.75) out_channels = int(module.out_channels * 0.75) new_conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=module.kernel_size, 
stride=module.stride, padding=module.padding, dilation=module.dilation, groups=module.groups, bias=module.bias is not None) with torch.no_grad(): new_conv.weight.copy_(module.weight[:out_channels, :in_channels]) if module.bias is not None: new_conv.bias.copy_(module.bias[:out_channels]) parent, last_name = find_parent(model, name) setattr(parent, last_name, new_conv) new_conv.pruned = True elif isinstance(module, nn.Linear): in_features = int(module.in_features * 0.75) out_features = int(module.out_features * 0.75) new_linear = nn.Linear(in_features=in_features, out_features=out_features, bias=module.bias is not None) with torch.no_grad(): new_linear.weight.copy_(module.weight[:out_features, :in_features]) if module.bias is not None: new_linear.bias.copy_(module.bias[:out_features]) parent, last_name = find_parent(model, name) setattr(parent, last_name, new_linear) new_linear.pruned = True elif isinstance(module, nn.GroupNorm): num_channels = int(module.num_channels * 0.75) for num_groups in [32, 24, 16, 12, 8, 6, 4, 2, 1]: if num_channels % num_groups == 0: break new_gn = nn.GroupNorm(num_groups=num_groups, num_channels=num_channels, eps=module.eps, affine=module.affine) with torch.no_grad(): new_gn.weight.copy_(module.weight[:num_channels]) new_gn.bias.copy_(module.bias[:num_channels]) parent, last_name = find_parent(model, name) setattr(parent, last_name, new_gn) new_gn.pruned = True elif isinstance(module, nn.LayerNorm): normalized_shape = int(module.normalized_shape[0] * 0.75) new_ln = nn.LayerNorm(normalized_shape, eps=module.eps, elementwise_affine=module.elementwise_affine) with torch.no_grad(): new_ln.weight.copy_(module.weight[:normalized_shape]) new_ln.bias.copy_(module.bias[:normalized_shape]) parent, last_name = find_parent(model, name) setattr(parent, last_name, new_ln) new_ln.pruned = True elif isinstance(module, Downsample2D) or isinstance(module, Upsample2D): module.channels = int(module.channels * 0.75) class Net(nn.Module): def __init__(self, 
unet, decoder): super().__init__() del unet.time_embedding new_conv_in = nn.Conv2d(16, 320, 3, padding=1) new_conv_in.weight.data = unet.conv_in.weight.data.repeat(1, 4, 1, 1) new_conv_in.bias.data = unet.conv_in.bias.data unet.conv_in = new_conv_in new_conv_out = nn.Conv2d(320, 342, 3, padding=1) new_conv_out.weight.data = unet.conv_out.weight.data.repeat(86, 1, 1, 1)[:342] new_conv_out.bias.data = unet.conv_out.bias.data.repeat(86,)[:342] unet.conv_out = new_conv_out def ResnetBlock2D_remove_time_emb_proj(module): if isinstance(module, ResnetBlock2D): del module.time_emb_proj unet.apply(ResnetBlock2D_remove_time_emb_proj) def BasicTransformerBlock_remove_cross_attn(module): if isinstance(module, BasicTransformerBlock): del module.attn2, module.norm2 unet.apply(BasicTransformerBlock_remove_cross_attn) def set_inplace_to_true(module): if isinstance(module, nn.Dropout) or isinstance(module, nn.SiLU): module.inplace = True unet.apply(set_inplace_to_true) def replace_forward_methods(module): if isinstance(module, CrossAttnDownBlock2D): module.forward = types.MethodType(MyCrossAttnDownBlock2D_SD_forward, module) elif isinstance(module, DownBlock2D): module.forward = types.MethodType(MyDownBlock2D_SD_forward, module) elif isinstance(module, UNetMidBlock2DCrossAttn): module.forward = types.MethodType(MyUNetMidBlock2DCrossAttn_SD_forward, module) elif isinstance(module, UpBlock2D): module.forward = types.MethodType(MyUpBlock2D_SD_forward, module) elif isinstance(module, CrossAttnUpBlock2D): module.forward = types.MethodType(MyCrossAttnUpBlock2D_SD_forward, module) elif isinstance(module, ResnetBlock2D): module.forward = types.MethodType(MyResnetBlock2D_SD_forward, module) elif isinstance(module, Transformer2DModel): module.forward = types.MethodType(MyTransformer2DModel_SD_forward, module) unet.apply(replace_forward_methods) unet.forward = types.MethodType(MyUNet2DConditionModel_SD_forward, unet) halve_channels(unet) unet.body = nn.Sequential( *unet.down_blocks, 
unet.mid_block, *unet.up_blocks, unet.conv_norm_out, unet.conv_act, unet.conv_out, ) del decoder.conv_in, decoder.up_blocks, decoder.conv_norm_out, decoder.conv_act, decoder.conv_out self.body = nn.Sequential( nn.PixelUnshuffle(2), unet, decoder.mid_block, ) def forward(self, x): return self.body(x) ================================================ FILE: ram/configs/condition_config.json ================================================ { "nf": 64 } ================================================ FILE: ram/configs/med_config.json ================================================ { "architectures": [ "BertModel" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 30524, "encoder_width": 768, "add_cross_attention": true } ================================================ FILE: ram/configs/q2l_config.json ================================================ { "architectures": [ "BertModel" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 4, "num_hidden_layers": 2, "pad_token_id": 0, "type_vocab_size": 2, "vocab_size": 30522, "encoder_width": 768, "add_cross_attention": true, "add_tag_cross_attention": false } ================================================ FILE: ram/configs/swin/config_swinB_384.json ================================================ { "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", "vision_width": 1024, "image_res": 384, "window_size": 12, "embed_dim": 128, "depths": [ 2, 2, 18, 2 ], "num_heads": [ 4, 8, 16, 32 ] } 
================================================ FILE: ram/configs/swin/config_swinL_384.json ================================================ { "ckpt": "pretrain_model/swin_large_patch4_window12_384_22k.pth", "vision_width": 1536, "image_res": 384, "window_size": 12, "embed_dim": 192, "depths": [ 2, 2, 18, 2 ], "num_heads": [ 6, 12, 24, 48 ] } ================================================ FILE: ram/configs/swin/config_swinL_444.json ================================================ { "ckpt": "pretrain_model/swin_large_patch4_window12_384_22k.pth", "vision_width": 1536, "image_res": 444, "window_size": 12, "embed_dim": 192, "depths": [ 2, 2, 18, 2 ], "num_heads": [ 6, 12, 24, 48 ] } ================================================ FILE: ram/data/ram_tag_list.txt ================================================ 3D CG rendering 3D glasses abacus abalone monastery belly academy accessory accident accordion acorn acrylic paint act action action film activity actor adaptation add adhesive tape adjust adult adventure advertisement antenna aerobics spray can afro agriculture aid air conditioner air conditioning air sock aircraft cabin aircraft model air field air line airliner airman plane airplane window airport airport runway airport terminal airship airshow aisle alarm alarm clock mollymawk album album cover alcohol alcove algae alley almond aloe vera alp alpaca alphabet german shepherd altar amber ambulance bald eagle American shorthair amethyst amphitheater amplifier amusement park amusement ride anchor ancient anemone angel angle animal animal sculpture animal shelter animation animation film animator anime ankle anklet anniversary trench coat ant antelope antique antler anvil apartment ape app app icon appear appearance appetizer applause apple apple juice apple pie apple tree applesauce appliance appointment approach apricot apron aqua aquarium aquarium fish aqueduct arcade arcade machine arch arch bridge archaelogical excavation archery archipelago architect 
architecture archive archway area arena argument arm armadillo armband armchair armoire armor army army base army tank array arrest arrow art art exhibition art gallery art print art school art studio art vector illustration artichoke article artifact artist artists loft ash ashtray asia temple asparagus asphalt road assemble assembly assembly line association astronaut astronomer athlete athletic atlas atm atmosphere atrium attach fighter jet attend attraction atv eggplant auction audi audio auditorium aurora author auto factory auto mechanic auto part auto show auto showroom car battery automobile make automobile model motor vehicle autumn autumn forest autumn leave autumn park autumn tree avatar avenue aviator sunglasses avocado award award ceremony award winner shed ax azalea baboon baby baby bottle baby carriage baby clothe baby elephant baby food baby seat baby shower back backdrop backlight backpack backyard bacon badge badger badlands badminton badminton racket bag bagel bagpipe baguette bait baked goods baker bakery baking baking sheet balance balance car balcony ball ball pit ballerina ballet ballet dancer ballet skirt balloon balloon arch baseball player ballroom bamboo bamboo forest banana banana bread banana leaf banana tree band band aid bandage headscarf bandeau bangs bracelet balustrade banjo bank bank card bank vault banknote banner banquet banquet hall banyan tree baozi baptism bar bar code bar stool barbecue barbecue grill barbell barber barber shop barbie barge barista bark barley barn barn owl barn door barrel barricade barrier handcart bartender baseball baseball base baseball bat baseball hat baseball stadium baseball game baseball glove baseball pitcher baseball team baseball uniform basement basil basin basket basket container basketball basketball backboard basketball coach basketball court basketball game basketball hoop basketball player basketball stadium basketball team bass bass guitar bass horn bassist bat bath bath heater bath mat 
bath towel swimwear bathrobe bathroom bathroom accessory bathroom cabinet bathroom door bathroom mirror bathroom sink toilet paper bathroom window batman wand batter battery battle battle rope battleship bay bay bridge bay window bayberry bazaar beach beach ball beach chair beach house beach hut beach towel beach volleyball lighthouse bead beagle beak beaker beam bean bean bag chair beanbag bear bear cub beard beast beat beautiful beauty beauty salon beaver bed bedcover bed frame bedroom bedding bedpan bedroom window bedside lamp bee beech tree beef beekeeper beeper beer beer bottle beer can beer garden beer glass beer hall beet beetle beige clock bell pepper bell tower belt belt buckle bench bend bengal tiger bento beret berry berth beverage bib bibimbap bible bichon bicycle bicycle helmet bicycle wheel biker bidet big ben bike lane bike path bike racing bike ride bikini bikini top bill billard billboard billiard table bin binder binocular biology laboratory biplane birch birch tree bird bird bath bird feeder bird house bird nest birdbath bird cage birth birthday birthday cake birthday candle birthday card birthday party biscuit bishop bison bit bite black black sheep blackberry blackbird blackboard blacksmith blade blanket sports coat bleacher blender blessing blind eye mask flasher snowstorm block blog blood bloom blossom blouse blow hair drier blowfish blue blue artist blue jay blue sky blueberry bluebird pig board board eraser board game boardwalk boat boat deck boat house paddle boat ride bobfloat bobcat body bodyboard bodybuilder boiled egg boiler bolo tie bolt bomb bomber bonasa umbellu bone bonfire bonnet bonsai book book cover bookcase folder bookmark bookshelf bookstore boom microphone boost boot border Border collie botanical garden bottle bottle cap bottle opener bottle screw bougainvillea boulder bouquet boutique boutique hotel bow bow tie bow window bowl bowling bowling alley bowling ball bowling equipment box box girder bridge box turtle boxer 
underdrawers boxing boxing glove boxing ring boy brace bracket braid brain brake brake light branch brand brandy brass brass plaque bread breadbox break breakfast seawall chest brewery brick brick building wall brickwork wedding dress bride groom bridesmaid bridge bridle briefcase bright brim broach broadcasting broccoli bronze bronze medal bronze sculpture bronze statue brooch creek broom broth brown brown bear brownie brunch brunette brush coyote brussels sprout bubble bubble gum bubble tea bucket cabinet shield bud buddha buffalo buffet bug build builder building building block building facade building material lamp bull bulldog bullet bullet train bulletin board bulletproof vest bullfighting megaphone bullring bumblebee bumper roll bundle bungee bunk bed bunker bunny buoy bureau burial chamber burn burrito bus bus driver bus interior bus station bus stop bus window bush business business card business executive business suit business team business woman businessman bust butcher butchers shop butte butter cream butterfly butterfly house button buttonwood buy taxi cabana cabbage cabin cabin car cabinet cabinetry cable cable car cactus cafe canteen cage cake cake stand calculator caldron calendar calf call phone box calligraphy calm camcorder camel camera camera lens camouflage camp camper campfire camping campsite campus can can opener canal canary cancer candle candle holder candy candy bar candy cane candy store cane jar cannon canopy canopy bed cantaloupe cantilever bridge canvas canyon cap cape cape cod cappuccino capsule captain capture car car dealership car door car interior car logo car mirror parking lot car seat car show car wash car window caramel card card game cardboard cardboard box cardigan cardinal cargo cargo aircraft cargo ship caribbean carnation carnival carnivore carousel carp carpenter carpet slipper house finch coach dalmatian aircraft carrier carrot carrot cake carry cart carton cartoon cartoon character cartoon illustration cartoon style 
carve case cash cashew casino casserole cassette cassette deck plaster bandage casting castle cat cat bed cat food cat furniture cat tree catacomb catamaran catamount catch catcher caterpillar catfish cathedral cattle catwalk catwalk show cauliflower cave caviar CD CD player cedar ceiling ceiling fan celebrate celebration celebrity celery cello smartphone cement graveyard centerpiece centipede ceramic ceramic tile cereal ceremony certificate chain chain saw chair chairlift daybed chalet chalice chalk chamber chameleon champagne champagne flute champion championship chandelier changing table channel chap chapel character sculpture charcoal charge charger chariot charity charity event charm graph chase chassis check checkbook chessboard checklist cheer cheerlead cheese cheeseburger cheesecake cheetah chef chemical compound chemist chemistry chemistry lab cheongsam cherry cherry blossom cherry tomato cherry tree chess chestnut chicken chicken breast chicken coop chicken salad chicken wing garbanzo chiffonier chihuahua child child actor childs room chile chili dog chimney chimpanzee chinaware chinese cabbage chinese garden chinese knot chinese rose chinese tower chip chipmunk chisel chocolate chocolate bar chocolate cake chocolate chip chocolate chip cookie chocolate milk chocolate mousse truffle choir kitchen knife cutting board chopstick christmas christmas ball christmas card christmas decoration christmas dinner christmas eve christmas hat christmas light christmas market christmas ornament christmas tree chrysanthemum church church tower cider cigar cigar box cigarette cigarette case waistband cinema photographer cinnamon circle circuit circuit board circus water tank citrus fruit city city bus city hall city nightview city park city skyline city square city street city wall city view clam clarinet clasp class classic classroom clavicle claw clay pottery clean clean room cleaner cleaning product clear cleat clementine client cliff climb climb mountain climber 
clinic clip clip art clipboard clipper clivia cloak clogs close-up closet cloth clothe clothing clothespin clothesline clothing store cloud cloud forest cloudy clover joker clown fish club clutch clutch bag coal coast coat coatrack cob cock cockatoo cocker cockpit roach cocktail cocktail dress cocktail shaker cocktail table cocoa coconut coconut tree coffee coffee bean coffee cup coffee machine coffee shop coffeepot coffin cognac spiral coin coke colander cold slaw collaboration collage collection college student sheepdog crash color coloring book coloring material pony pillar comb combination lock comic comedy comedy film comet comfort comfort food comic book comic book character comic strip commander commentator community commuter company compass compete contest competitor composer composition compost computer computer box computer chair computer desk keyboard computer monitor computer room computer screen computer tower concept car concert concert hall conch concrete condiment condom condominium conductor cone meeting conference center conference hall meeting room confetti conflict confluence connect connector conservatory constellation construction site construction worker contain container container ship continent profile contract control control tower convenience store convention conversation converter convertible transporter cook cooking cooking spray cooker cool cooler copper copy coral coral reef rope corded phone liquor corgi cork corkboard cormorant corn corn field cornbread corner trumpet cornice cornmeal corral corridor corset cosmetic cosmetics brush cosmetics mirror cosplay costume costumer film designer infant bed cottage cotton cotton candy couch countdown counter counter top country artist country house country lane country pop artist countryside coupe couple couple photo courgette course court courthouse courtyard cousin coverall cow cowbell cowboy cowboy boot cowboy hat crab crabmeat crack cradle craft craftsman cranberry crane crape crapper 
crate crater lake lobster crayon cream cheese cream pitcher create creature credit card crescent croissant crest crew cricket cricket ball cricket team cricketer crochet crock pot crocodile crop crop top cross crossbar crossroad crosstalk crosswalk crouton crow crowbar crowd crowded crown crt screen crucifix cruise cruise ship cruiser crumb crush crutch crystal cub cube cucumber cue cuff cufflink cuisine farmland cup cupcake cupid curb curl hair roller currant currency curry curtain curve pad customer cut cutlery cycle cycling cyclone cylinder cymbal cypress cypress tree dachshund daffodil dagger dahlia daikon dairy daisy dam damage damp dance dance floor dance room dancer dandelion dark darkness dart dartboard dashboard date daughter dawn day bed daylight deadbolt death debate debris decanter deck decker bus decor decorate decorative picture deer defender deity delicatessen deliver demolition monster demonstration den denim jacket dentist department store depression derby dermopathy desert desert road design designer table table lamp desktop desktop computer dessert destruction detective detergent dew dial diamond diaper diaper bag journal die diet excavator number digital clock dill dinner rowboat dining room dinner party dinning table dinosaur dip diploma direct director dirt dirt bike dirt field dirt road dirt track disaster disciple disco disco ball discotheque disease plate dish antenna dish washer dishrag dishes dishsoap Disneyland dispenser display display window trench dive diver diving board paper cup dj doberman dock doctor document documentary dog dog bed dog breed dog collar dog food dog house doll dollar dollhouse dolly dolphin dome domicile domino donkey donut doodle door door handle doormat doorplate doorway dormitory dough downtown dozer drag dragon dragonfly drain drama drama film draw drawer drawing drawing pin pigtail dress dress hat dress shirt dress shoe dress suit dresser dressing room dribble drift driftwood drill drink drinking water drive 
driver driveway drone drop droplight dropper drought medicine pharmacy drum drummer drumstick dry duchess duck duckbill duckling duct tape dude duet duffel canoe dumbbell dumpling dune dunk durian dusk dust garbage truck dustpan duvet DVD dye eagle ear earmuff earphone earplug earring earthquake easel easter easter bunny easter egg eat restaurant eclair eclipse ecosystem edit education educator eel egg egg roll egg tart eggbeater egret Eiffel tower elastic band senior electric chair electric drill electrician electricity electron electronic elephant elevation map elevator elevator car elevator door elevator lobby elevator shaft embankment embassy embellishment ember emblem embroidery emerald emergency emergency service emergency vehicle emotion Empire State Building enamel enclosure side table energy engagement engagement ring engine engine room engineer engineering english shorthair ensemble enter entertainer entertainment entertainment center entrance entrance hall envelope equestrian equipment eraser erhu erosion escalator escargot espresso estate estuary eucalyptus tree evening evening dress evening light evening sky evening sun event evergreen ewe excavation exercise exhaust hood exhibition exit explorer explosion extension cord extinguisher extractor extrude eye eye shadow eyebrow eyeliner fabric fabric store facade face face close-up face powder face towel facial tissue holder facility factory factory workshop fair fairground fairy falcon fall family family car family photo family room fan fang farm farmer farmer market farmhouse fashion fashion accessory fashion designer fashion girl fashion illustration fashion look fashion model fashion show fast food fastfood restaurant father faucet fault fauna fawn fax feast feather fedora feed feedbag feeding feeding chair feline mountain lion fence fender fern ferret ferris wheel ferry fertilizer festival fiber fiction fiction book field field road fig fight figure skater figurine file file photo file cabinet fill 
film camera film director film format film premiere film producer filming filter fin hand finish line fir fir tree fire fire alarm fire department fire truck fire escape fire hose fire pit fire station firecracker fireman fireplace firework firework display first-aid kit fish fish boat fish market fish pond fishbowl fisherman fishing fishing boat fishing net fishing pole fishing village fitness fitness course five fixture fjord flag flag pole flake flame flamingo flannel flap flare flash flask flat flatfish flavor flea flea market fleet flight flight attendant flip flip-flop flipchart float flock flood floor floor fan floor mat floor plan floor window floral arrangement florist floss flour flow flower flower basket flower bed flower box flower field flower girl flower market fluid flush flute fly fly fishing flyer horse foam fog foggy foie gra foil folding chair leaf folk artist folk dance folk rock artist fondant hotpot font food food coloring food court food processor food stand food truck foosball foot foot bridge football football coach football college game football match football field football game football helmet football player football stadium football team path footprint footrest footstall footwear forbidden city ford forehead forest forest fire forest floor forest path forest road forge fork forklift form formal garden formation formula 1 fort fortification forward fossil foundation fountain fountain pen fox frame freckle highway lorry French French bulldog French fries French toast freshener fridge fried chicken fried egg fried rice friendship frisbee frog frost frosting frosty frozen fruit fruit cake fruit dish fruit market fruit salad fruit stand fruit tree fruits shop fry frying pan fudge fuel fume hood fun funeral fungi funnel fur fur coat furniture futon gadget muzzle galaxy gallery game game board game controller ham gang garage garage door garage kit garbage garden garden asparagus garden hose garden spider gardener gardening garfield gargoyle 
wreath garlic garment gas gas station gas stove gasmask collect gathering gauge gazebo gear gecko geisha gel general store generator geranium ghost gift gift bag gift basket gift box gift card gift shop gift wrap gig gin ginger gingerbread gingerbread house ginkgo tree giraffe girl give glacier gladiator glass bead glass bottle glass bowl glass box glass building glass door glass floor glass house glass jar glass plate glass table glass vase glass wall glass window glasses glaze glider earth glove glow glue pudding go go for goal goalkeeper goat goat cheese gobi goggles gold gold medal Golden Gate Bridge golden retriever goldfish golf golf cap golf cart golf club golf course golfer goose gorilla gothic gourd government government agency gown graduate graduation grain grampus grand prix grandfather grandmother grandparent granite granola grape grapefruit wine grass grasshopper grassland grassy grater grave gravel gravestone gravy gravy boat gray graze grazing green greenery greet greeting greeting card greyhound grid griddle grill grille grilled eel grind grinder grits grocery bag grotto ground squirrel group group photo grove grow guacamole guard guard dog guest house guest room guide guinea pig guitar guitarist gulf gull gun gundam gurdwara guzheng gym gymnast habitat hacker hail hair hair color hair spray hairbrush haircut hairgrip hairnet hairpin hairstyle half hall halloween halloween costume halloween pumpkin halter top hamburg hamburger hami melon hammer hammock hamper hamster hand dryer hand glass hand towel handbag handball handcuff handgun handkerchief handle handsaw handshake handstand handwriting hanfu hang hangar hanger happiness harbor harbor seal hard rock artist hardback book safety helmet hardware hardware store hardwood hardwood floor mouth organ pipe organ harpsichord harvest harvester hassock hat hatbox hautboy hawthorn hay hayfield hazelnut head head coach headlight headboard headdress headland headquarter hearing heart heart shape heat heater 
heather hedge hedgehog heel helicopter heliport helmet help hen henna herb herd hermit crab hero heron hibiscus hibiscus flower hide high bar high heel highland highlight hike hiker hiking boot hiking equipment hill hill country hill station hillside hindu temple hinge hip hip hop artist hippo historian historic history hockey hockey arena hockey game hockey player hockey stick hoe hole vacation holly holothurian home home appliance home base home decor home interior home office home theater homework hummus honey beehive honeymoon hood hoodie hook jump horizon hornbill horned cow hornet horror horror film horse blanket horse cart horse farm horse ride horseback horseshoe hose hospital hospital bed hospital room host inn hot hot air balloon hot dog hot sauce hot spring hotel hotel lobby hotel room hotplate hourglass house house exterior houseplant hoverboard howler huddle hug hula hoop person humidifier hummingbird humpback whale hunt hunting lodge hurdle hurricane husky hut hyaena hybrid hydrangea hydrant seaplane ice ice bag polar bear ice cave icecream ice cream cone ice cream parlor ice cube ice floe ice hockey player ice hockey team lollipop ice maker rink ice sculpture ice shelf skate ice skating iceberg icicle icing icon id photo identity card igloo light iguana illuminate illustration image impala incense independence day individual indoor indoor rower induction cooker industrial area industry infantry inflatable boat information desk infrastructure ingredient inhalator injection injury ink inking pad inlet inscription insect install instrument insulated cup interaction interior design website intersection interview invertebrate invitation ipad iphone ipod iris iron ironing board irrigation system island islet isopod ivory ivy izakaya jack jackcrab jacket jacuzzi jade jaguar jail cell jam japanese garden jasmine jaw jay jazz jazz artist jazz fusion artist jeans jeep jelly jelly bean jellyfish jet motorboat jewel jewellery jewelry shop jigsaw puzzle rickshaw 
jockey jockey cap jog joint journalist joystick judge jug juggle juice juicer jujube jump rope jumpsuit jungle junkyard kale kaleidoscope kangaroo karaoke karate karting kasbah kayak kebab key keycard khaki kick kilt kimono kindergarden classroom kindergarten king king crab kiss kit kitchen kitchen cabinet kitchen counter kitchen floor kitchen hood kitchen island kitchen sink kitchen table kitchen utensil kitchen window kitchenware kite kiwi knee pad kneel knife rider knit knitting needle knob knocker knot koala koi ktv laboratory lab coat label labrador maze lace lace dress ladder ladle ladybird lagoon lake lake district lake house lakeshore lamb lamb chop lamp post lamp shade spear land land vehicle landfill landing landing deck landmark landscape landslide lanyard lantern lap laptop laptop keyboard larva lasagne laser lash lasso latch latex latte laugh launch launch event launch party laundromat laundry laundry basket laundry room lava lavender lawn lawn wedding lawyer lay lead lead singer lead to leader leak lean learn leash leather leather jacket leather shoe speech lecture hall lecture room ledge leftover leg legend legging legislative chamber lego legume lemon lemon juice lemonade lemur lens lens flare lentil leopard leotard tights leprechaun lesson letter mailbox letter logo lettering lettuce level library license license plate lichen lick lid lie life belt life jacket lifeboat lifeguard lift light fixture light show light switch lighting lightning lightning rod lilac lily limb lime limestone limo line line art line up linen liner lion lip balm lipstick liquid liquor store list litchi live livestock living room living space lizard load loading dock loafer hallway locate lock lock chamber locker loft log log cabin logo loki long hair longboard loom loop lose lottery lotus love loveseat luggage lumber lumberjack lunch lunch box lush luxury luxury yacht mac macadamia macaque macaroni macaw machete machine machine gun magazine magic magician magnet magnifying 
glass magnolia magpie mahjong mahout maid chain mail mail slot make makeover makeup artist makeup tool mallard mallard duck mallet mammal mammoth man management manager manatee mandala mandarin orange mandarine mane manga manger mango mangosteen mangrove manhattan manhole manhole cover manicure mannequin manor house mansion mantid mantle manufactured home manufacturing manuscript map maple maple leaf maple syrup maraca marathon marble march marching band mare marigold marine marine invertebrate marine mammal puppet mark market market square market stall marriage martial martial artist martial arts gym martini martini glass mascara mascot mashed potato masher mask massage mast mat matador match matchbox material mattress mausoleum maxi dress meal measuring cup measuring tape meat meatball mechanic mechanical fan medal media medical equipment medical image medical staff medicine cabinet medieval medina meditation meerkat meet melon monument menu mermaid net mess messenger bag metal metal artist metal detector meter mezzanine microphone microscope microwave midnight milestone military uniform milk milk can milk tea milkshake mill mine miner mineral mineral water miniskirt miniature minibus minister minivan mint mint candy mirror miss missile mission mistletoe mix mixer mixing bowl mixture moat mobility scooter model model car modern modern tower moisture mold molding mole monarch money monitor monk monkey monkey wrench monochrome monocycle monster truck moon moon cake moonlight moor moose swab moped morning morning fog morning light morning sun mortar mosaic mosque mosquito moss motel moth mother motherboard motif sport motor motorbike motorcycle motorcycle helmet motorcycle racer motorcyclist motorsport mound mountain mountain bike mountain biker mountain biking mountain gorilla mountain lake mountain landscape mountain pass mountain path mountain range mountain river mountain snowy mountain stream mountain view mountain village mountaineer mountaineering bag mouse 
mousepad mousetrap mouth mouthwash move movie poster movie ticket mower mp3 player mr mud muffin mug mulberry mulch mule municipality mural muscle muscle car museum mushroom music music festival music stool music studio music video performer musical keyboard musician mussel mustard mythology nacho nail polish nailfile nanny napkin narrow national flag nativity scene natural history museum nature nature reserve navigation navratri navy nebula neck neckband necklace neckline nectar nectarine needle neighbor neighbourhood neon neon light nerve nest new year newborn newfoundland newlywed news news conference newsstand night night market night sky night view nightclub nightstand noodle nose noseband note notebook notepad notepaper notice number icon nun nurse nursery nursing home nut nutcracker oak oak tree oar oasis oast house oatmeal oats obelisk observation tower observatory obstacle course sea octopus offer office office building office chair office cubicle office desk office supply office window officer official oil oil lamp oil painting oilrig okra old photo olive olive oil olive tree omelet onion onion ring opal open opening opening ceremony opera opera house operate operating room operation optical shop orangutan orange orange juice orange tree orangery orbit orchard orchestra pit orchid order organization origami ornament osprey ostrich otter out outcrop outdoor outhouse electric outlet outline oval oven overall overcoat overpass owl oyster teething ring pack package paddock police van padlock paella pagoda pain paint brush painter paisley bandanna palace palette paling pall palm tree pan pancake panda panel panorama pansy pant pantry pants pantyhose papaya paper paper bag paper cutter paper lantern paper plate paper towel paperback book paperweight parachute parade paradise parrot paramedic paraquet parasail paratrooper parchment parish park park bench parking parking garage parking meter parking sign parliament parsley participant partner partridge party 
party hat pass passage passbook passenger passenger ship passenger train passion fruit passport pasta paste pastry pasture patch patient pattern pavement pavilion paw pay payphone pea peace peach peacock peak peanut peanut butter pear pearl pebble pecan pedestrian pedestrian bridge pedestrian street peel peeler pegboard pegleg pelican pen penalty kick pencil pencil case pencil sharpener pencil skirt pendant pendulum penguin peninsula pennant penny piggy bank peony pepper pepper grinder peppercorn pepperoni perch perform performance performance arena perfume pergola persian cat persimmon personal care personal flotation device pest pet pet shop pet store petal petunia church bench pheasant phenomenon philosopher phone phonebook record player photo photo booth photo frame photography physicist physics laboratory pianist piano plectrum pick up pickle picnic picnic area picnic basket picnic table picture picture frame pie pigeon pilgrim tablet pillow pilot pilot boat pin pine pine cone pine forest pine nut pineapple table tennis table table tennis pink pint pipa pipe pipe bowl pirate pirate flag pirate ship pistachio ski slope pocket bread pitaya pitbull pitch pitcher pitcher plant pitchfork pizza pizza cutter pizza pan pizzeria placard place place mat plaid plain plan planet planet earth plank plant plantation planting plaque plaster plastic plasticine plateau platform platinum platter play play badminton play baseball play basketball play billiard play football play pong play tennis play volleyball player playground playhouse playing card playing chess playing golf playing mahjong playingfield playpen playroom plaza plier plot plow plug plug hat plum plumber plumbing fixture plume plywood pocket pocket watch pocketknife pod podium poetry poinsettia point pointer poker card poker chip poker table pole polecat police police car police dog police station politician polka dot pollen pollution polo polo neck polo shirt pomegranate pomeranian poncho pond ponytail poodle 
pool pop pop artist popcorn pope poppy porcelain porch pork porridge portable battery portal portfolio porthole portrait portrait session pose possum post post office stamp postcard poster poster page pot potato potato chip potato salad potholder potty pouch poultry pound pour powder power line power plugs and sockets power see power station practice Prague Castle prayer preacher premiere prescription show presentation president press room pressure cooker pretzel prince princess print printed page printer printing prison produce product profession professional professor project picture projection screen projector prom promenade propeller prophet proposal protective suit protest protester publication publicity portrait ice hockey pudding puddle puff puffin pug pull pulpit pulse pump pumpkin pumpkin pie pumpkin seed punch bag punch student purple push putt puzzle tower pyramid python qr code quail quarry quarter quartz queen quesadilla queue quiche quilt quilting quote rabbit raccoon race race track raceway race car racket radar radiator radio raft rag doll rail railcar railroad railroad bridge railway line railway station rain rain boot rainbow rainbow trout raincoat rainforest rainy raisin rake ram ramp rapeseed rapid rapper raspberry rat ratchet raven ravine ray razor razor blade read reading reamer rear rear light rear view rearview mirror receipt receive reception recipe record record producer recorder recording studio recreation room recreational vehicle rectangle recycling recycling bin red red carpet red flag red panda red wine redwood reed reef reel referee reflect reflection reflector register rein reindeer relax release relief religion religious relish remain remodel remote remove repair repair shop reptile rescue rescuer research researcher reservoir residence residential neighborhood resin resort resort town restaurant kitchen restaurant patio restroom retail retriever retro reveal rhinoceros rhododendron rib ribbon rice rice cooker rice field ride ridge 
riding rifle rim ring riot ripple rise rise building river river bank river boat river valley riverbed road road sign road trip roadside roast chicken robe robin robot stone rock arch rock artist rock band rock climber rock climbing rock concert rock face rock formation rocker rocket rocking chair rocky rodent rodeo rodeo arena roe roe deer roller coaster roller skate roller skates rolling pin romance romantic roof roof garden room room divider root root beer rope bridge rosary rose rosemary rosy cloud rottweiler round table router row rowan royal rubber stamp rubble rubik's cube ruby ruffle rugby rugby ball rugby player ruins ruler rum run runner running shoe rural rust rustic rye sack saddle saddlebag safari safe safety vest sage sail sailboat sailing sailor squirrel monkey sake salad salad bowl salamander salami sale salmon salon salsa salt salt and pepper shakers salt lake salt marsh salt shaker salute samoyed samurai sand sand bar sand box sand castle sand sculpture sandal sandwich sanitary napkin santa claus sapphire sardine sari sashimi satay satchel satellite satin sauce saucer sauna sausage savanna saw sawbuck sax saxophonist scaffold scale scale model scallop scar strawman scarf scene scenery schnauzer school school bus school uniform schoolhouse schooner science science fiction film science museum scientist scissors wall lamp scone scoop scooter score scoreboard scorpion scout scrambled egg scrap scraper scratch screen screen door screenshot screw screwdriver scroll scrub scrubbing brush sculptor sculpture sea cave sea ice sea lion sea turtle sea urchin seabass seabed seabird seafood seahorse seal sea view seashell seaside resort season seat seat belt seaweed secretary security sedan see seed seesaw segway selfie sell seminar sense sensor server server room service set sewing machine shadow shake shaker shampoo shape share shark sharpener sharpie shaver shaving cream shawl shear shears sheep sheet sheet music shelf shell shellfish shelter shelve shepherd 
sherbert shiba inu shine shipping shipping container shipwreck shipyard shirt shirtless shoal shoe shoe box shoe shop shoe tree shoot shooting basketball guard shop window shopfront shopper shopping shopping bag shopping basket shopping cart mall shopping street shore shoreline short short hair shorts shot glass shotgun shoulder shoulder bag shovel showcase shower shower cap shower curtain shower door shower head shredder shrew shrimp shrine shrub shutter siamese siberia sibling side side cabinet side dish sidecar sideline siding sign signage signal signature silk silk stocking silo silver silver medal silverware sing singe singer sink sip sit sitting skate park skateboard skateboarder skater skating rink skeleton sketch skewer ski ski boot ski equipment ski jacket ski lift ski pole ski resort snowboard skier skiing shoes skin skull skullcap sky sky tower skylight skyline skyscraper slalom slate sleigh sleep sleeping bag sleepwear sleeve slice slide slider sling slope slot slot machine sloth slow cooker slug slum smell smile smoke snack snail snake snapper snapshot snorkel snout snow snow leopard snow mountain snowball snowboarder snowfield snowflake snowman snowmobile snowplow snowshoe snowy soap soap bubble soap dispenser soccer goalkeeper socialite sock socket soda softball software solar battery soldier solo solution sombrero song sound soup soup bowl soupspoon sour cream souvenir soybean milk spa space space shuttle space station spacecraft spaghetti span wrench spark sparkle sparkler sparkling wine sparrow spatula speaker spectator speech bubble speed limit speed limit sign speedboat speedometer sphere spice spice rack spider spider web spike spin spinach spire splash sponge spoon sport association sport equipment sport team sports ball sports equipment sports meet sportswear dot spray spread spring spring roll sprinkle sprinkler sprout spruce spruce forest squad square squash squat squeeze squid squirrel water gun stab stable stack stadium staff stage stage 
light stagecoach stain stainless steel stair stairs stairwell stall stallion stand standing staple stapler star stare starfish starfruit starling state park state school station stationary bicycle stationery statue steak steak knife steam steam engine steam locomotive steam train steamed bread steel steering wheel stem stencil step stool stereo stethoscope stew stick stick insect sticker still life stilt stingray stir stirrer stirrup sew stock stocking stomach stone building stone carving stone house stone mill stool stop stop at stop light stop sign stop watch traffic light storage box storage room tank store storefront stork storm storm cloud stormy stove poker straddle strainer strait strap straw straw hat strawberry stream street art street artist street corner street dog street food street light street market street photography street scene street sign street vendor stretch stretcher strike striker string string cheese strip stripe stroll structure studio studio shot stuff stuffed animal stuffed toy stuffing stump stunning stunt stupa style stylus submarine submarine sandwich submarine water suburb subway subway station subwoofer succulent suede sugar sugar bowl sugar cane sugar cube suit suite summer summer evening summit sun sun hat sunbathe sunday sundial sunflower sunflower field sunflower seed sunglasses sunny sunrise sunset sunshade sunshine super bowl sports car superhero supermarket supermarket shelf supermodel supporter surf surface surfboard surfer surgeon surgery surround sushi sushi bar suspenders suspension suspension bridge suv swallow swallowtail butterfly swamp swan swan boat sweat pant sweatband sweater sweatshirt sweet sweet potato swim swim cap swimmer swimming hole swimming pool swing swing bridge swinge swirl switch swivel chair sword swordfish symbol symmetry synagogue syringe syrup system t shirt t-shirt tabasco sauce tabby table tennis racket table top tablecloth tablet computer tableware tachometer tackle taco tae kwon do tai chi tail 
tailor take takeoff talk tambourine tan tangerine tape tapestry tarmac taro tarp tart tassel taste tatami tattoo tattoo artist tavern tea tea bag tea party tea plantation tea pot tea set teach teacher teacup teal team photo team presentation tear technician technology teddy tee teenager telegraph pole zoom lens telescope television television camera television room television studio temperature temple tempura tennis tennis court tennis match tennis net tennis player tennis racket tent tequila terminal terrace terrain terrarium territory test test match test tube text text message textile texture thanksgiving thanksgiving dinner theater theatre actor therapy thermometer thermos thermos bottle thermostat thicket thimble thing thinking thistle throne throne room throw throw pillow thunder thunderstorm thyme tiara tick ticket ticket booth tide pool tie tiger tight tile tile flooring tile roof tile wall tin tinfoil tinsel tiramisu tire tissue toast toaster tobacco tobacco pipe toddler toe tofu toilet bowl toilet seat toiletry tokyo tower tomato tomato sauce tomato soup tomb tong tongs tool toolbox toothbrush toothpaste toothpick topiary garden topping torch tornado tortilla tortoise tote bag totem pole totoro toucan touch touchdown tour tour bus tour guide tourist tourist attraction tournament tow truck towel towel bar tower block tower bridge town town square toy toy car toy gun toyshop track tractor trade tradition traditional traffic traffic cone traffic congestion traffic jam traffic sign trail trailer trailer truck train train bridge train car train interior train track train window trainer training training bench training ground trolley trampoline transformer transparency travel tray treadmill treat tree tree branch tree farm tree frog tree house tree root tree trunk trial triangle triathlon tribe tributary trick tricycle trim trio tripod trombone troop trophy trophy cup tropic trout truck truck driver tub tube tugboat tulip tuna tundra tunnel turbine turkey turn 
turnip turquoise turret turtle tusk tv actor tv cabinet tv drama tv genre tv personality tv show tv sitcom tv tower twig twilight twin twine twist type type on typewriter ukulele ultraman umbrella underclothes underwater unicorn uniform universe university up urban urinal urn use utensil utility room vacuum valley valve vampire van vanilla vanity variety vase vault vector cartoon illustration vector icon vegetable vegetable garden vegetable market vegetation vehicle veil vein velvet vending machine vendor vent vespa vessel vest vet veteran veterinarians office viaduct video video camera video game videotape view mirror vigil villa village vine vinegar vineyard violence violet violin violinist violist vision visor vodka volcano volleyball volleyball court volleyball player volunteer voyage vulture waffle waffle iron wagon wagon wheel waist waiter waiting hall waiting room walk walking walking cane wall clock wallpaper walnut walrus war warehouse warm warning sign warrior warship warthog wash washer washing washing machine wasp waste waste container watch water water bird water buffalo water cooler water drop water feature water heater water level water lily water park water pipe water purifier water ski water sport water surface water tower watercolor watercolor illustration watercolor painting waterfall watering can watermark overlay stamp watermelon waterproof jacket waterway wave wax weapon wear weather vane web webcam wedding wedding ring wedding bouquet wedding cake wedding couple wedding invitation wedding party wedding photo wedding photographer wedding photography wedding reception wedge weed weight weight scale welder well western food western restaurant wet wet bar wet suit wetland wetsuit whale whale shark wheat wheat field wheel wheelchair wheelie whipped cream whisk whisker whiskey whistle white white house white wine whiteboard wicket wide wield wig Wii Wii controller wild wildebeest wildfire wildflower wildlife willow wind wind chime wind farm wind 
turbine windmill window window box window display window frame window screen window seat window sill wiper windshield windy wine bottle wine cooler wine cabinet wine cellar wine glass wine rack wine tasting winery wing winter winter melon winter morning winter scene winter sport winter storm wire wisteria witch witch hat wok wolf woman wood wood duck wood floor wood wall wood-burning stove wooden spoon woodland woodpecker woodworking plane wool job work card workbench worker workplace workshop world worm worship wound wrap wrap dress wrapping paper wrestle wrestler wrinkle wristband write writer writing writing brush writing desk yacht yak yard yellow yoga yoga mat yoghurt yoke yolk youth youth hostel yurt zebra zebra crossing zen garden zip zipper zombie zongzi zoo ================================================ FILE: ram/data/ram_tag_list_chinese.txt ================================================ 三维CG渲染 3d眼镜 算盘 鲍鱼 修道院 肚子 学院 附件 事故 手风琴 橡子 丙烯颜料 表演 行动 动作电影 活动 演员 改编本 添加 胶带 调整 成人 冒险 广告 天线 有氧运动 喷雾罐 爆炸头 农业 帮助 空调 空调系统 风向标 飞机客舱 飞机模型 机场 航线 客机 飞行员 飞机 飞机窗口 机场 机场跑道 航站楼 飞艇 航展 过道 警报 闹钟 信天翁 唱片 唱片封面 酒精 壁龛 水藻 胡同/球道 杏仁 芦荟 高山 羊驼 字母表 德国牧羊犬 圣坛 琥珀 救护车 秃鹰 美国短毛猫 紫水晶 圆形剧场 扩音器 游乐园 游乐设施 锚 古老的 海葵 天使 角 动物 动物雕塑 动物收容所 动画片 动画电影 动画师 动漫 脚踝 短袜 周年庆 风衣 蚂蚁 羚羊 古董 鹿角 铁砧 公寓 猿 应用程序 应用图标 出现 外观 开胃菜 掌声 苹果 苹果汁 苹果派 苹果树 苹果酱 设备 约定 通道 杏子 围裙 浅绿色 水族馆 观赏鱼 渡槽 游乐中心 商场游戏机 拱门 拱桥 考古现场 射箭 群岛 建筑师 建筑设计 档案 拱门 地区 竞技场 争论 手臂 穿山甲 臂章 扶手椅 衣柜 盔甲 军队 军事基地 坦克 阵列 逮捕 箭头 艺术 艺术展 美术馆 艺术印刷品 艺术学校 艺术工作室 艺术矢量插图 洋蓟 文章 手工艺品 艺术家 艺术阁楼 灰 烟灰缸 亚洲寺庙 芦笋 沥青道路 组装 集会 生产流水线 协会 宇航员 天文学家 运动员 运动 地图集 自助取款机 大气层 中庭 连接 战斗机 参加 吸引力 全地形车 茄子 拍卖 奥迪汽车 音频 礼堂 极光 作者 汽车厂 汽车修理工 汽车零件 车展 汽车展厅 汽车电池 汽车制造 汽车模型 汽车 秋天 秋天的森林 秋天的叶子 秋天的公园 秋天的树 阿凡达 林荫大道 飞行员太阳镜 牛油果 奖品 颁奖典礼 获奖者 棚 斧头 杜鹃花 狒狒 婴儿 奶瓶 婴儿车 婴儿衣服 小象 婴儿食品 婴儿座椅 迎婴派对 背后/后面 背景 背光 背包 后院 培根 徽章 獾 荒地 羽毛球运动 羽毛球拍 袋子 面包圈 风笛 法棍 诱饵 焙烤食品 面包师 面包店 烘焙 烤盘 平衡 平衡车 阳台 球 球池 芭蕾舞女演员 芭蕾舞 芭蕾舞演员 芭蕾舞裙 气球 气球拱门 棒球手 舞厅 竹子 竹林 香蕉 香蕉面包 香蕉叶子 香蕉树 乐队 创可贴 绷带 头巾 束发带 刘海 手镯 栏杆 五弦琴 银行 银行卡 银行金库 纸币 横幅/旗帜 宴会 宴会厅 榕树 包子 洗礼 酒吧 条形码 高脚凳 烧烤 烧烤架 杠铃 理发师 理发店 芭比娃娃 驳船 咖啡师 树皮 大麦 谷仓 
仓鸮 挡光板 桶 路障 屏障 手推车 酒保 棒球 棒球基地 棒球棒 棒球帽 棒球场 棒球比赛 棒球手套 棒球投手 棒球队 棒球制服 地下室 罗勒 水盆 篮子 篮子 篮球 篮球篮板 篮球教练 篮球场 篮球比赛 篮球框 篮球运动员 篮球馆 篮球队 贝斯 低音吉他 低音喇叭 贝斯手 球棒/球拍 浴室 水浴加热器 浴垫 浴巾 泳装 浴袍 浴室 浴室配件 浴室柜 浴室门 浴室镜子 浴室水槽 卫生纸 浴室窗户 蝙蝠侠 棒子 接连猛打/击球员 电池 战斗 战绳 战舰 海湾 海湾大桥 凸窗 杨梅 集市 海滩 沙滩球 沙滩椅 海滨别墅 海滩小屋 沙滩毛巾 沙滩排球 灯塔 珠子 比格犬 鸟嘴 烧杯 横梁 豆子 豆袋椅 豆袋 熊 幼熊 胡子 野兽 击打/击败 美丽的 美丽 美容院 海狸 床 床单 床架 卧室 床上用品 便盆 卧室窗户 床头灯 蜜蜂 山毛榉 牛肉 养蜂人 蜂鸣器 啤酒 啤酒瓶 啤酒罐 啤酒花园 啤酒杯 啤酒馆 甜菜 甲虫 米色 时钟 甜椒 钟楼 皮带 皮带扣 长凳 弯曲 孟加拉虎 盒饭 贝雷帽 浆果 停泊位 饮料 围嘴 拌饭 圣经 比熊 自行车 自行车头盔 自行车车轮 自行车骑士 坐浴盆 大本钟 自行车道 自行车道 自行车赛 骑车 比基尼 比基尼上衣 账单 台球 广告牌 台球台 垃圾箱 活页夹 双筒望远镜 生物学实验室 双翼飞机 桦木 桦树 鸟 鸟池 喂鸟器 鸟舍 鸟巢 鸟池 鸟笼 出生 生日 生日蛋糕 生日蜡烛 生日贺卡 生日聚会 饼干 主教 野牛 钻头 咬 黑色 黑山羊 黑莓 乌鸦 黑板 铁匠 叶片/刀片 毯子/覆盖层 运动外套 看台 搅拌机 祝福 窗帘 眼罩 闪光 暴风雪 块 博客 血 开花 花 女装衬衫 吹 吹风机 河豚 蓝色 蓝色艺术家 蓝松鸦 蓝天 蓝莓 蓝知更鸟 猪 板子 板擦 棋盘游戏 木板路 船 船甲板 船屋 桨 乘船 浮标 山猫 躯干 身体冲浪板 健美运动员 水煮鸡蛋 锅炉 饰扣式领带 门闩 炸弹 轰炸机 披肩榛鸡 骨骼 篝火 阀盖 盆景 书 书籍封面 书柜 文件夹 书签 书架 书店 远程拾音器 推动 靴子 边界 边境牧羊犬 植物园 瓶 瓶盖 开瓶器 螺旋开瓶器 三角梅 巨石 花束 时装店 精品酒店 鞠躬/蝴蝶结 领结 弓形窗 碗 保龄球运动 保龄球馆 保龄球 保龄球设备 盒子 箱形梁桥 箱龟 拳击手 内裤 拳击 拳击手套 拳击台 男孩 支撑物 支架 辫子 大脑 刹车 刹车灯 树枝 商标 白兰地 黄铜 黄铜牌匾 面包 面包箱 休息 早餐 防浪堤 胸部 啤酒厂 砖块 砖建筑物 墙 砖块 婚纱 新娘 新郎 伴娘 桥 缰绳 公文包 明亮的 边沿 钻头 广播 西兰花 青铜 铜牌 青铜雕塑 青铜雕像 胸针 小溪 扫帚 肉汤 棕色 棕熊 巧克力蛋糕 早午餐 浅黑肤色的女人 刷子 郊狼 包菜 气泡 泡泡糖 珍珠奶茶 斗柜 盾牌 芽 佛 水牛 自助餐 昆虫 建造 建造者 建筑 积木 建筑立面 建筑材料 灯 牛 斗牛犬 子弹 动车 公告栏 防弹背心 斗牛 扩音器 斗牛场 大黄蜂 保险杠 卷/地形起伏 捆 蹦极 双层床 地堡/击球 兔子 浮标 书桌 墓室 燃烧 玉米煎饼 公交车 公交车司机 公交车内部 公交车站 公交车站 公交车窗户 灌木 商业 名片 业务主管 商务西装 业务团队 女商人 商人 半身像 屠夫 肉铺 孤峰 黄油 奶油 蝴蝶 蝴蝶馆 按钮 梧桐树 购买 出租车 小屋 卷心菜 小屋/机舱 守车 储藏柜 橱柜 电缆 缆车 仙人掌 咖啡馆 食堂 笼子 蛋糕 蛋糕台 计算器 大锅 日历 小腿 通话 电话亭 书法 平静的 摄像机 骆驼 相机 相机镜头 迷彩 露营 露营者 篝火 露营 营地 校园 罐 开罐器 运河 金丝雀 癌症 蜡烛 烛台 糖果 块状糖 柺杖糖 糖果店 拐杖 罐子 大炮 树冠/顶棚 四柱床 香瓜 悬臂桥 帆布 峡谷 帽子 斗篷 科德角 卡布奇诺 胶囊 队长 捕获 车 汽车经销商 车门 汽车内饰 车标 后视镜 停车场 汽车座椅 车展 洗车 车窗 焦糖 卡片 纸牌游戏 纸板 纸板盒 羊毛衫 红衣凤头鸟 货物 货运飞机 货船 加勒比 康乃馨 狂欢节 食肉动物 旋转木马 鲤鱼 木匠 地毯 拖鞋 红雀 长途客车 斑点狗 航空母舰 胡萝卜 胡萝卜蛋糕 携带 手推车 纸箱/纸盒 卡通 卡通人物 卡通插图 卡通风格 雕刻 容器 现金 腰果 赌场 砂锅 磁带 盒式录音机 石膏绷带 铸造 城堡 猫 猫窝 猫粮 猫器具 猫架 地下墓穴 双体船 美洲狮 握着/抓着 捕手 毛毛虫 鲶鱼 教堂 牛 猫步 走秀 菜花 洞穴 鱼子酱 光盘 CD播放器 雪松 天花板 吊扇 庆祝 庆典 名人 芹菜 大提琴 手机 水泥 墓地 中心装饰品 蜈蚣 陶瓷 瓷砖 麦片 仪式 证书 链条 链锯 椅子 升降椅 躺椅 木屋 圣杯 粉笔 房间 变色龙 香槟酒 香槟杯 
冠军 锦标赛 吊灯 婴儿换尿布台 通道 皴裂处 小教堂 人物雕塑 木炭 充电 充电器 战车 慈善机构 慈善活动 魅力 图表 追逐 底盘 检查/支票 支票簿 棋盘 检查表 欢呼声 鼓励/啦啦队 奶酪 奶酪汉堡 奶酪蛋糕 猎豹 厨师 化合物 化学家 化学 化学实验室 旗袍 樱桃 樱花 樱桃番茄 樱桃树 国际象棋 栗子 鸡 鸡胸肉 鸡笼 鸡肉沙拉 鸡翅 鹰嘴豆 小衣橱 吉娃娃 孩子 童星 孩子的房间 红番椒 辣热狗 烟囱 黑猩猩 瓷器 白菜 中国园林 中国结 月季 中国塔 炸薯条/炸薯条 花栗鼠 凿子 巧克力 巧克力棒 巧克力蛋糕 巧克力碎片 巧克力饼干 巧克力牛奶 巧克力慕斯 松露 唱诗班 厨房刀 砧板 筷子 圣诞节 圣诞球 圣诞贺卡 圣诞装饰 圣诞晚宴 平安夜 圣诞帽 圣诞灯 圣诞市场 圣诞装饰 圣诞树 菊花 教堂 教堂塔 苹果酒 雪茄 雪茄盒 香烟 烟盒 腰带 电影院 摄影师 肉桂 圆 电路 电路板 马戏团 水箱 柑橘类水果 城市 城市公交 市政厅 城市夜景 城市公园 城市天际线 城市广场 城市街道 城墙 城市景观 蛤蜊 单簧管 扣子 班级 经典 教室 锁骨 爪子 黏土 陶器 清洁 洁净室 清洁工人 清洁用品 清晰的 栓 克莱门氏小柑橘 客户端 悬崖 爬 爬山 登山者 诊所 夹子 剪贴画 剪贴板 快速帆船 君子兰 斗篷 木底鞋 特写 壁橱 布 穿衣 衣服 晒衣夹 晒衣绳 服装店 云 云雾森林 多云 三叶草 小丑 小丑鱼 俱乐部 离合器 手拿包 煤炭 海岸 外套 衣帽架 玉米 公鸡 凤头鹦鹉 可卡犬 驾驶 蟑螂 鸡尾酒 小礼服 鸡尾酒调制器 鸡尾酒桌 可可 椰子 椰子树 咖啡 咖啡豆 咖啡杯 咖啡机 咖啡店 咖啡壶 棺材 法国白兰地 螺旋 硬币 可口可乐 滤器 冷的 卷心菜沙拉 合作 拼贴画 收藏品 大学生 牧羊犬 碰撞 颜色 涂色书 染色材料 矮种马 柱子 梳子 密码锁 喜剧演员 喜剧 喜剧电影 彗星 舒服 安慰食物 漫画书 漫画人物 连环画 指挥官 评论员 社区 通勤 公司 指南针 比赛 比赛 竞争者 作曲家 作文 堆肥 电脑 电脑机箱 电脑椅 电脑桌 键盘 计算机显示器 计算机房 电脑屏幕 机箱 概念车 音乐会 音乐厅 贝壳 混凝土 调味品 避孕套 独立产权的公寓 指挥 锥形物 会议 会议中心 会议厅 会议室 五彩纸屑 冲突 合流 连接 连接器 温室 星座 建筑工地 建筑工人 包含 容器 集装箱船 大陆 轮廓 合同 控制 控制塔 便利店 集会 交谈 转换器 可转换的 输送机 厨师/烹饪 烹饪 烹饪喷雾剂 炊具 凉的 冷却器 铜 一本/一册 珊瑚 珊瑚礁 粗绳 有线电话 酒 威尔士矮脚狗 瓶塞 软木板 鸬鹚 玉米 玉米田 玉米面包 角落 小号 飞檐 燕麦片 围栏 走廊 紧身衣 化妆品 化妆刷 化妆镜 角色扮演 服装 服装电影设计师 婴儿床 小屋 棉花 棉花糖 沙发 倒计时 柜台 台面 最佳乡村歌手 乡村别墅 乡村公路 乡村流行歌手 农村 双门小轿车 夫妇/两人/几个 情侣写真 小胡瓜 课程 球场 法院 院子 堂兄弟 工作服 奶牛 母牛的颈铃 牛仔 牛仔靴 牛仔帽 螃蟹 蟹肉 裂纹 摇篮 工艺 工匠 蔓越莓 起重机 黑纱 厕所 板条箱 火山口湖 龙虾 蜡笔 奶油乳酪 奶油罐 创建 生物 信用卡 新月形 新月形面包 山顶 全体船员 蟋蟀 板球用球 板球队 板球队员 钩边 克罗克电锅 鳄鱼 庄稼 露脐上衣 交叉 横木 十字路口 相声 人行横道 油煎面包块 乌鸦 撬棍 人群 拥挤的 皇冠 阴极射线管屏幕 耶稣受难像 巡游 游轮 巡洋艇 面包屑 压坏 拐杖 水晶 幼兽 立方体 黄瓜 球杆 袖口 袖扣 烹饪 农田 杯子 纸杯蛋糕 丘比特 马路牙子 旋度 卷发器 无籽葡萄干 货币 咖喱 窗帘 曲线 软垫 顾客 切 餐具 自行车 骑自行车 龙卷风 汽缸 铙钹 柏树 柏树 达克斯猎狗 水仙花 匕首 大丽花 萝卜 乳制品 雏菊 大坝 损害 潮湿的 跳舞 舞池 舞蹈室 舞者 蒲公英 黑暗 黑暗 飞镖 圆靶 指示板 日期 女儿 黎明 天床上 日光 门栓 死亡 辩论 碎片 玻璃水瓶 甲板 双层巴士 装饰 装修/装饰 装饰画 鹿 后卫 神 熟食 投递 拆迁 怪兽 演示 兽窝/休闲室 牛仔夹克 牙医 百货商店 抑郁症 德比 皮肤病 沙漠 沙漠公路 设计 设计师 桌子/表格 台灯 桌面 台式电脑 甜点 破坏 侦探 洗涤剂 露水 仪表盘 钻石 尿布 尿布包 杂志 死 饮食 挖掘机 数字 数字时钟 莳萝 晚餐 小船 餐厅 晚宴 餐桌 恐龙 浸 文凭 指引 导演 尘埃 越野摩托车 泥土地 泥土路 泥路/土路 灾难 信徒 迪斯科舞厅 迪斯科灯秋 迪斯科舞厅 疾病 盘子 碟形天线 洗碗机 抹布 菜肴 洗碗液 迪斯尼乐园 自动售货机 展示 陈列窗 壕沟 潜水 潜水员 
跳水板 纸杯 流行音乐播音员 杜宾犬 码头 医生 文件 纪录片 狗 狗窝 犬种 狗项圈 狗粮 狗窝 洋娃娃 美元 玩偶之家 洋娃娃 海豚 穹顶 住宅 多米诺骨牌 驴 甜甜圈 涂鸦 门 门把手 受气包 门牌 门口 宿舍 面团 市中心 推土机 拖 龙 蜻蜓 排水沟 剧本 戏剧电影 画 抽屉里 图画/画画 图钉 辫子 连衣裙/特定场合的服装 礼帽 正装衬衫 皮鞋 大礼服 梳妆台 更衣室 运球 漂移 浮木 钻 饮品/喝 饮用水 开车 司机 车道 无人机 水滴/下降 吊灯 滴管 干旱 药物 药店 鼓 鼓手 鸡腿 干的 公爵夫人 鸭子 鸭嘴兽 小鸭子 布基胶带 伙计 二重唱 粗呢 独木舟 哑铃 饺子 沙丘 扣篮 榴莲 黄昏 灰尘 垃圾车 簸箕 羽绒被 DVD 染料 鹰 耳朵 御寒耳罩 耳机 耳塞 耳环 地震 画架 复活节 复活节兔子 复活节彩蛋 吃 餐厅 泡芙 日食 生态系统 编辑 教育 教育家 鳗鱼 蛋 蛋卷 蛋挞 打蛋器 白鹭 埃菲尔铁塔 橡皮筋 上级 电椅 电钻 电工 电 电子 电子器件 大象 高度图 电梯 电梯轿厢 电梯门 电梯大堂 电梯井 路堤 大使馆 装饰 灰烬 会徽 刺绣 翡翠 紧急 紧急服务 紧急车辆 情感 帝国大厦 搪瓷 外壳/围墙 茶几 能源 订婚 订婚戒指 引擎 机舱 工程师 工程 英国短毛猫 乐团 回车键 演艺人员 娱乐 娱乐中心 入口 入口大厅 信封 马术 设备 橡皮擦 二胡 侵蚀 自动扶梯 食用蜗牛 浓缩咖啡 房地产 河口 桉树 晚上 晚礼服 夜光 傍晚天空 晚上的太阳 事件 常绿的 母羊 挖掘 运动 排气罩 展览 出口 探险者 爆炸 延长线 灭火器 排气扇 挤压 眼睛 眼影 眉 眼线笔 布料 纺织品商店 外观 脸 脸部特写 蜜粉 毛巾 面巾纸架 设施 工厂 工厂车间 集市 露天市场 仙女 猎鹰 秋天 家庭 家庭轿车 全家福 家庭房 风扇/扇子 尖牙 农场 农民 农民市场 农舍 时尚 时尚配饰 时装设计师 时尚的女孩 时装插图 时装大片 时装模特 时装表演 快餐 西式快餐 父亲 水龙头 故障 动物 小鹿 传真 宴会 羽毛 软呢帽 饲料 一餐 饲养 喂养的椅子 猫科 美洲狮 栅栏 芬达 蕨类植物 雪貂 摩天轮 渡船 肥料 节日 纤维 小说 小说书 田野/场地/野外 田间道路 无花果 打架 花样滑冰运动员 小雕像 文件 档案照片 文件柜 填满 胶片相机 电影导演 电影格式 电影首映礼 电影制片人 拍摄 过滤器 鳍 手 终点线 冷杉 冷杉树 火 火灾报警 消防部门 消防车 消防通道 消防水带 火坑 消防站 爆竹 消防队员 壁炉 烟花 烟花表演 急救箱 鱼 鱼船 海鲜市场 鱼塘 鱼缸 渔夫 钓鱼 渔船 渔网 钓鱼 渔村 健身 健身课程 五个 固定装置 峡湾 国旗 旗杆 小薄片 火焰 火烈鸟 法兰绒 拍打 耀斑 闪光 烧瓶 平 比目鱼 风味 跳蚤 跳蚤市场 舰队 飞行 空中乘务员 翻转 触发器 翻转图 浮动 群 洪水 地板/地面 落地扇 脚垫 楼层平面图 落地窗 插花艺术 花店 牙线 面粉 流动 花 花篮 花坛 花箱 花田 花童 花卉市场 流体 冲洗 长笛 飞 飞行钓鱼 传单 马 泡沫 雾 多雾的 鹅肝酱 箔纸 折椅 树叶 民间艺术家 民间舞蹈 民间摇滚艺术家 方旦糖 火锅 圣洗池 食物 食用色素 美食广场 食品加工机 小吃摊 快餐车 桌上足球 脚 人行桥 足球 足球教练 大学橄榄球赛 足球比赛 足球场 足球比赛 橄榄球头盔 足球运动员 足球场 足球队 小路 脚印 脚踏板 台座 鞋子 故宫 浅滩 额头 森林 森林大火 森林地面 森林小路 森林公路 锻造 餐叉 叉车 表格 园林 队列/形成物 F1方程式赛车 堡垒 碉堡 追逐 化石 粉底 喷泉 钢笔 狐狸 框架 雀斑 高速公路 卡车 法国 法国斗牛犬 薯条 法式吐司 化妆水 冰箱 炸鸡 煎蛋 炒饭 友谊 飞盘 青蛙 霜 结霜 严寒 结冰 水果 水果蛋糕 水果盘 水果市场 水果沙拉 水果摊 果树 水果商店 油炸食品 煎锅 软糖 燃料 吸烟罩 有趣的 葬礼 真菌 漏斗 毛皮衣服 毛皮大衣 家具 蒲团 小工具 枪口 星云/星系 美术馆 游戏 游戏棋盘 游戏手柄 火腿 团伙 车库 车库门 手工模型 垃圾 花园 花园芦笋 橡胶软管 花园蜘蛛 园丁 园艺 加菲猫 滴水嘴 花环 大蒜 衣服 气体 加油站 煤气炉 防毒面具 收集 聚集 测量仪器 露台 齿轮 壁虎 艺妓 凝胶 百货商店 发电机 天竺葵 幽灵 礼物 礼品袋 礼品篮 礼物盒 礼品卡 礼品商店 礼物包装 演唱会 杜松子酒 姜 姜饼 姜饼屋 银杏树 长颈鹿 女孩 给 冰川 角斗士 玻璃珠 玻璃瓶 玻璃碗 玻璃箱 玻璃建筑 玻璃门 玻璃地板 玻璃屋 玻璃罐 玻璃板 玻璃桌子 玻璃花瓶 玻璃墙 玻璃窗 眼镜 光滑面 滑翔机 地球 手套 
发光 汤圆 去 袭击 球门 守门员 山羊 羊奶酪 戈壁 护目镜/墨镜 黄金 金牌 金门大桥 金毛猎犬 金鱼 高尔夫运动 高尔夫球帽 高尔夫球车 高尔夫球杆 高尔夫球场 高尔夫球手 鹅 大猩猩 哥特式 葫芦 政府 政府机构 礼服 毕业生 毕业典礼 谷物 逆戟鲸 大奖赛 祖父 祖母 祖父母 花岗岩 格兰诺拉麦片 葡萄 西柚 葡萄酒 草 蚱蜢 草原 长满草的 擦菜器 坟墓 碎石 墓碑 肉汁 调味汁瓶 灰色 吃草 放牧 绿色 绿色植物 欢迎 问候 贺卡 灰狗 网格 筛子 烧烤架 格栅 烤鳗鱼 磨 研磨机 粗燕麦粉 杂货袋 洞穴 地松鼠 群体 合影 小树林 生长 牛油果酱 警卫 看门狗 宾馆 客房 指南 豚鼠 吉他 吉他手 海湾 海鸥 枪 高达 谒师所 古筝 健身房 体操运动员 栖息地 黑客 冰雹 头发 头发颜色 发胶 毛刷 发型 发夹 发网 发夹 发型 一半 礼堂 万圣节 万圣节服装 万圣节南瓜 露背装 汉堡 汉堡包 哈密瓜 锤子 吊床 阻碍 仓鼠 烘手机 放大镜 擦手巾 手提包 手球 手铐 手枪 手帕 把手 手锯 握手 倒立 手写 汉服 悬挂 飞机库 衣架 幸福 海港 斑海豹 硬摇滚艺术家 精装书 建筑工人 硬件 五金店 硬木 硬木地板 口琴 管风琴 羽管键琴 收获 收割机 坐垫/搁脚凳/草丛 帽子 帽盒 双簧管 山楂 干草 干草地 榛子 头 主教练 大灯 床头板 头饰 海岬 总部 听力 心脏 心形 热能 加热器 帚石楠 树篱 刺猬 脚后跟 直升机 直升机机场 头盔 帮助 母鸡 指甲花 药草 兽群 寄居蟹 英雄 苍鹭 芙蓉花 芙蓉花 隐藏/隐蔽处 高杠 高跟鞋 高地 突出 徒步旅行 徒步旅行者 徒步靴 登山设备 山丘 丘陵地 别墅 山坡 印度教寺庙 铰链 臀部 嘻哈艺人 河马 历史学家 历史遗迹 历史 曲棍球 冰球馆 曲棍球比赛 曲棍球运动员 曲棍球棒 锄头 洞 假日 冬青树 海参 家/住宅 家用电器 基地 家居装饰 室内设计 内政部 家庭影院 家庭作业 鹰嘴豆泥 蜂蜜 蜂窝 蜜月 风帽 连帽衫 挂钩/勾住 跳 地平线 犀鸟 长角牛 大黄蜂 震惊 恐怖电影 马鞍褥 马车 马场 骑马 马背 马蹄铁 软管 医院 医院病床 病房 主持人 小旅馆 热 热气球 热狗 辣椒酱 温泉 旅馆 酒店大堂 酒店房间 电炉 沙漏 房子 房子外部 室内植物 悬滑板 吼 蜷缩 拥抱 呼啦圈 人 增湿器 蜂鸟 座头鲸 打猎 狩猎小屋 障碍 飓风 哈士奇 小屋 鬣狗 混合物 绣球花 消火栓 水上飞机 冰 冰袋 北极熊 冰洞 冰淇淋 冰淇淋蛋卷 冰淇淋商店 冰块 浮冰 冰球运动员 冰球队 棒棒糖 制冰机 溜冰场 冰雕 冰架 溜冰鞋 滑冰 冰山 冰柱 糖衣/酥皮 图标 身份证照片 身份证 冰屋 光/灯光/光线 鬣蜥蜴 照亮 插图 形象 黑斑羚 熏香 独立日 个人 室内 划船器 电磁炉 工业区 工业 步兵 充气艇 服务台 基础设施 成分 吸入器 注射 受伤 墨水 印泥 小湖湾 题词 昆虫 安装 乐器/器械 绝缘杯 互动 室内设计 网站 十字路口 面试 无脊椎动物 邀请 平板电脑 苹果手机 苹果音乐播放器 虹膜 铁 熨衣板 灌溉系统 岛 小岛 等足类动物 象牙 常青藤 居酒屋 千斤顶 帝王蟹/蟹 夹克衫 按摩浴缸 玉 美洲虎 监狱牢房 果酱 日式花园 茉莉花 下巴 松鸦 爵士乐 爵士乐艺术家 爵士融合艺术家 牛仔裤 吉普车 果冻 果冻豆 水母 喷气式飞机 摩托艇 珠宝 珠宝 珠宝店 拼图游戏 人力车 赛马骑师 赛马帽 慢跑 联合的 记者 操纵杆 法官 水壶 玩杂耍 果汁 榨汁器 枣子 跳绳 连身裤 丛林 废品堆放场 羽衣甘蓝 万花筒 袋鼠 卡拉ok 空手道 卡丁车运动 旧城区 皮船 烤肉串 按键/钥匙 门卡 卡其色 踢 苏格兰裙 和服 幼儿园教室 幼儿园 国王 帝王蟹 亲吻 工具包 厨房 厨房橱柜 厨房台面 厨房地板 厨房抽油烟机 厨房岛 厨房水槽 厨房桌子 厨房用具 厨房窗户 厨房用具 风筝 猕猴桃 护膝 跪下 餐刀 骑手 编织 编织针 球形把手 门环 结 考拉 锦鲤 ktv 实验室 实验室外套 标签 拉布拉多 迷宫 网眼织物 蕾丝连衣裙 梯子 长柄杓 瓢虫 环礁湖 湖泊 湖区 湖边小屋 湖岸 羊肉 羊排 灯柱 灯罩 矛 土地 陆地车辆 废物填埋 着陆 降落甲板 地标 风景 山崩 挂带 灯笼 腿/大腿 笔记本电脑 笔记本键盘 幼体 烤宽面条 激光 睫毛 套索 门闩 乳胶 拿铁咖啡 笑 发射 发布会 举办会议 自助洗衣店 洗衣房 洗衣篮 洗衣房 熔岩 薰衣草 草坪 草坪婚礼 律师 躺 引领 主唱 通向 领袖 泄漏 倾斜/倚靠 学习 皮带 皮革 皮夹克 皮鞋 演讲 演讲厅 教学室 窗台 剩饭 腿 传说 紧身裤/秋裤 立法院 乐高 豆类 柠檬 柠檬汁 柠檬水 狐猴 镜头 
眩光 扁豆 豹 紧身连衣裤 紧身裤袜 小妖精 课程 信函 信箱 信的标志 刻字 生菜 水平 图书馆 许可证 车牌 地衣 舔 盖子 躺着 安全带 救生衣 救生艇 救生员 提起 灯具 灯光秀 电灯开关 照明/照明设备 闪电 避雷针 淡紫色 百合 肢体 石灰 石灰石 豪华轿车 线条 艺术线条 排队 亚麻 邮轮 狮子 润唇膏 口红 液体 酒类商店 列表 荔枝 生活 家畜 客厅 生活空间 蜥蜴 负载 装卸码头 游手好闲的人 走廊 定位 锁 闸室 储物柜 阁楼 原木 小木屋 标志 洛基 长头发 冲浪板 隐约显现/织布机 环状 遗失 彩票 莲花 爱 双人沙发 行李 木材 伐木工人 午餐 午餐盒 郁郁葱葱的 奢侈品 豪华游艇 雨衣 澳洲胡桃 短尾猿 通心粉 金刚鹦鹉 弯刀 机器 机枪 杂志 魔法 魔术师 磁铁 放大镜 木兰花 喜鹊 麻将 象夫 女仆 邮件 邮件槽 制作 改造 化妆师 化妆工具 野鸭 野鸭 槌棒 哺乳动物 猛犸象 男人 管理 经理 海牛 曼荼罗 橘子 普通话 鬃毛 漫画 食槽 芒果 山竹果 红树林 曼哈顿 检修孔 井盖 修指甲 人体模型 庄园主宅 大厦 螳螂 地幔 活动房层 制造业 手稿 地图 枫木 枫叶 枫糖浆 沙球 马拉松 大理石 行进 行进乐队 母马 金盏花 水兵 海洋无脊椎动物 海洋哺乳动物 木偶 标志 集市 市场广场 市场摊位 结婚 武术 武术家 武术馆 马提尼 马丁尼酒杯 睫毛膏 吉祥物 土豆泥 搅碎机 面具/口罩 按摩 桅杆 地垫 斗牛士 比赛 火柴盒 衣料 床垫 陵墓 长裙 一餐 量杯 卷尺 肉类 肉丸 机械师 机械风扇 奖牌 媒体 医疗设备 医学图像 医务人员 医药箱 中世纪的 麦地那市 冥想 猫鼬 赛事 香瓜 纪念碑 菜单 美人鱼 网 肮脏 信使袋 金属 金属艺术家 金属探测器 计量器 中层楼 麦克风 显微镜 微波炉 午夜 里程碑 军装 牛奶 牛奶罐 奶茶 奶昔 磨坊 矿井 矿工 矿物质 矿泉水 迷你 微缩模型 面包车 部长 小型货车 薄荷 薄荷糖 镜子 小姐 投掷物 任务 槲寄生 混合 搅拌机 搅拌碗 混合物 护城河 电动踏板车 模型/模特 汽车模型 现代 现代大厦 潮湿 模具 模具 鼹鼠 君主 钱 监控器 和尚 猴子 活动扳手 黑白照片 独轮脚踏车 怪物卡车 月亮 月饼 月光 沼泽 驼鹿 拖把 助力车 早晨 晨雾 晨光 朝阳 砂浆 马赛克 清真寺 蚊子 藓类植物 汽车旅馆 蛾 母亲 主板 主题 动作 电动机 摩托车 摩托车 摩托车头盔 摩托车赛车手 骑摩托车的人 赛车运动 土堆 山 山地自行车 山地自行车员 山地自行车运动 山地大猩猩 山湖 山景观 山口 山路 山脉 山区河流 山雪 山间溪流 山景城 山村 登山者 登山包 鼠标/鼠 鼠标垫 捕鼠器 嘴 漱口水 移动 电影海报 电影票 割草机 mp3播放器 先生 泥 松饼 马克杯 桑树 覆盖物 骡子 直辖市 壁画 肌肉 肌肉车 博物馆 蘑菇 音乐 音乐节 音乐凳子 音乐工作室 音乐录影带表演者 音乐键盘 音乐家 贻贝 芥末 神话 烤干酪辣味玉米片 指甲油 指甲锉 保姆 餐巾 狭窄的 国旗 基督诞生的场景 自然历史博物馆 自然 自然保护区 导航 九夜节 海军 星云 脖子 围颈带/领口 项链 领口 花蜜 油桃 针状物 邻居 与某处邻近的地区 霓虹灯 霓虹灯 神经 巢 新年 新生的 纽芬兰 新婚 新闻 记者招待会 报摊 晚上 夜市 夜空 夜景 夜总会 床头柜 面条 鼻子 鼻羁 注解 笔记本 记事本 信纸 公告 数字图标 修女 护士 托儿所 养老院 螺母 胡桃夹子 橡木 橡树 桨 绿洲 烘干室 燕麦片 燕麦 方尖塔 观察塔 天文台 超越障碍训练场 海洋 章鱼 提供 办公室 办公大楼 办公椅 办公室隔间 办公桌 办公用品 办公室的窗户 军官 行政官员 石油 油灯 油画 石油钻台 秋葵 老照片 橄榄 橄榄油 橄榄树 煎蛋卷 洋葱 洋葱圈 蛋白石 开阔的/张开 开始 开幕式 歌剧 歌剧院 操作 手术室 操作 眼镜店 猩猩 橙子/橙色 橙汁 橙树 橘园 轨道 果园 乐池 兰花 订单 组织 折纸 点缀 鱼鹰 鸵鸟 水獭 外面的 露头 户外 厕所 电源插头 大纲 椭圆形 烤箱 整体 大衣 天桥 猫头鹰 牡蛎 橡皮环 包裹 包/包装/包裹 围场 警车 挂锁 肉菜饭 宝塔 疼痛 油漆刷 画家 佩斯利印花大手帕 宫殿 调色板 栅栏 棺罩 棕榈树 平底锅 煎饼 熊猫 面板 全景 三色堇 喘息 储藏室 裤子 连裤袜 木瓜 纸 纸袋 切纸机 纸灯笼 纸盘子 纸巾 平装书 压纸器 降落伞 游行 天堂 鹦鹉 护理人员 长尾小鹦鹉 滑翔伞 伞兵 羊皮纸 教区 公园 公园长椅 停车 停车场 停车费 停车标志 议会 欧芹/香菜 参与者 合作伙伴 帕特里奇 聚会 派对帽 通过 通道 存折 乘客 客船 
旅客列车 百香果 护照 面食 粘贴 糕点 牧场 补丁 病人 图案/款式 人行道/硬路面 大帐篷 爪子 支付 付费电话 豌豆 和平 桃子 孔雀 山峰/尖顶 花生 花生酱 梨 珍珠 卵石 山核桃 行人 人行天桥 步行街 果皮 削皮器 小钉板 木质腿 鹈鹕 笔/围栏 点球 铅笔 铅笔盒 卷笔刀 铅笔裙 吊坠 钟摆 企鹅 半岛 锦标旗 便士 储蓄罐 牡丹 胡椒/辣椒 胡椒研磨机 胡椒子 意大利辣香肠 栖息/鲈鱼 表演 表演 表演舞台 香水 绿廊 波斯猫 柿子 个人护理 个人漂浮装置 害虫 宠物 宠物店 宠物店 花瓣 佩妮 教堂的长椅 野鸡 现象 哲学家 电话 电话簿 留声机 照片 照相亭 相框 摄影 物理学家 物理实验室 钢琴家 钢琴 选择 捡起 泡菜 野餐 野餐区 野餐篮 野餐桌 图片 相框 馅饼 鸽子 朝圣者 药片 枕头 飞行员 领航艇 别针 松树 松果 松林 松子 菠萝 乒乓球桌 乒乓球 粉色 一品脱的量 琵琶 管子 管碗 海盗 海盗旗 海盗船 阿月浑子 滑雪场 口袋里的面包 火龙果 斗牛犬 球场 大水罐 猪笼草 干草叉 披萨 披萨刀 比萨锅 披萨店 招牌 地方 餐具垫 格子 平原 示意图 行星 行星地球 厚木板 植物 种植园 种植 匾额 石膏 塑料 橡皮泥 高原 平台 白金 大浅盘 玩/演奏/运动 打羽毛球 打棒球 打篮球 玩台球 踢足球 玩乒乓球 打网球 打排球 选手/运动员 操场 剧场 扑克牌 下棋 打高尔夫球 打麻将 运动场 护栏 游戏室 广场 钳子 故事情节 犁 插头 插头帽 李子 水管工 卫生洁具 羽毛 夹板 口袋 怀表 随身小折刀 圆荚体 乐队指挥台 诗歌 一品红 指/朝向 指针 扑克卡 筹码 扑克表 杆/柱 臭猫 警察 警车 警犬 警察局 政治家 圆点 花粉 污染 马球 马球领 马球衬衫 石榴 波美拉尼亚的 雨披 池塘 马尾辫 贵宾犬 池 流行 流行艺术家 爆米花 教皇 罂粟 瓷 玄关 猪肉 粥 便携式电池 门户网站 投资组合 汽门 肖像 肖像会话 摆姿势拍照 负鼠 帖子 邮局 邮票 明信片 海报 海报页 锅/罐/陶盆 土豆 土豆片 土豆沙拉 布垫子 便壶 袋 家禽 英镑 倾泻 粉末 电源线 电源插头及插座 权力看 电站 练习 布拉格城堡 祈祷 牧师 首映 处方 显示 演讲 总统 新闻发布室 高压锅 椒盐卷饼 王子 公主 打印 打印页面 打印机 印刷 监狱 农产品/生产 产品 职业 专业的 教授 项目图片 投影屏幕 投影仪 毕业舞会 散步 螺旋桨 先知 建议 防护服 抗议 抗议者 出版 宣传画像 冰上曲棍球 布丁 水坑 泡芙 角嘴海雀 哈巴狗 拉 讲坛 脉冲 泵 南瓜 南瓜饼 南瓜种子 拳击吊袋 拳头猛击/穿孔 学生 紫色 推 轻轻一击 谜题 塔 金字塔 大蟒 二维码 鹌鹑 采石场 季度 石英 女王 油炸玉米粉饼 队列 乳蛋饼 被子 绗缝 引用 兔子 浣熊 比赛 赛道 水沟/跑道 赛车 球拍 雷达 散热器 广播 木筏/橡皮艇 布娃娃 栏杆/铁轨 轨道车 铁道 铁路桥梁 轨道线 火车站 雨 雨靴 彩虹 虹鳟鱼 雨衣 热带雨林 多雨的 葡萄干 耙子 公羊 斜坡 油菜籽 快速 说唱歌手 树莓 老鼠 棘轮 乌鸦 峡谷 雷 剃须刀 锋利的 阅读 阅读材料 钻孔器 后面 尾灯 后视图 后视镜 收据 收到 接待 配方 记录 唱片制作人 记录器/竖笛 录音室 娱乐室 休闲车 矩形 回收 回收站 红色 红地毯 红旗 红熊猫 红酒 红木 芦苇 礁石 卷轴 裁判 倒影 倒影 反射器 注册 控制 驯鹿 放松 释放 救援 宗教 宗教的 享受 保持 改造 遥控器 移除 修复 维修店 爬行动物 救援 救助者 研究 研究员 储层 住宅 居民区 树脂 度假胜地 度假小镇 餐厅的厨房 餐厅的露台 厕所 零售 寻回犬 制动火箭 揭示 犀牛 杜鹃 肋骨 丝带 大米 电饭煲 稻田 骑/搭乘 脊 骑马 步枪 边缘 环/戒指 暴乱 涟漪 上升 高层建筑 河 河岸 河船 河谷 河床 路 路标 公路旅行 路边 烤鸡 长袍 罗宾 机器人 石头 岩石拱 摇滚艺术家 摇滚乐队 攀岩者 攀岩 摇滚音乐会 岩石表面 岩层 摇滚歌手 火箭 摇椅 岩石 啮齿动物 牛仔竞技表演 竞技舞台 罗伊 狍子 辊 过山车 轮式溜冰鞋 溜冰鞋 擀面杖 浪漫 浪漫的 屋顶 屋顶花园 房间 房间分频器 根 根啤酒 绳索桥 念珠 玫瑰 迷迭香 玫瑰色的云 罗特韦尔犬 圆桌 路由器 行 罗文 皇家 橡皮图章 废墟 魔方 红宝石 莱夫 橄榄球 橄榄球 橄榄球运动员 毁坏 尺 朗姆酒 跑 跑步者 跑步鞋 农村的 锈 乡村的 黑麦 袋 鞍 鞍囊 旅行 安全 安全背心 圣人 帆 帆船 航行 水手 松鼠猴 缘故 沙拉 沙拉碗 火蜥蜴 意大利蒜味腊肠 出售 三文鱼 沙龙 萨尔萨舞 盐 盐和胡椒瓶 盐湖 盐沼 
盐瓶 敬礼 萨莫耶德人 武士 沙子 沙洲 砂箱 沙堡 沙雕 凉鞋 三明治 卫生巾 圣诞老人 蓝宝石 沙丁鱼 莎丽 生鱼片 沙爹 书包 卫星 缎 酱汁 碟子 桑拿 香肠 稀树大草原 锯 锯木架 萨克斯管 萨克斯手 脚手架 秤/标尺 比例模型 扇贝 疤痕 稻草人 围巾 场景 风景 雪纳瑞犬 学校 校车 校服 校舍 纵帆船 科学 科幻电影 科学博物馆 科学家 剪刀 壁灯 司康饼 勺子 踏板车/摩托车 分数 记分板 蝎子 童子军 炒蛋 废弃 刮板 刮伤 屏幕 纱门 截图 螺杆 螺丝刀 长卷纸/卷轴 擦洗 硬毛刷 雕塑家 雕塑 海洞穴 海冰 海狮 海龟 海胆 尖吻鲈 海底 海鸟 海鲜 海马 海豹 海景 海贝 海滨度假胜地 季节 座位 安全带 海藻 秘书 安全 小轿车 看到 种子 跷跷板 赛格威 自拍 出售 研讨会 感觉 传感器 服务器 服务器机房 服务 集 缝纫机 影子 摇 瓶 洗发水 形状 分享 鲨鱼 卷笔刀 记号笔 剃须刀 剃须膏 披肩/围巾 剪切 剪刀 羊 床单 乐谱 架子 贝壳 贝类 避难所 搁置 牧羊人 果子露 柴犬 发光 航运 集装箱 海难 船厂 衬衫 赤膊的 浅滩 鞋 鞋盒 鞋店 鞋楦 射击 得分篮球后卫 商店橱窗 门面 购物者 购物 购物袋 购物篮 购物车 购物中心 购物街 海岸 海岸线 短的 短发 短裤 小酒杯 散弹枪 肩膀 单肩包 铲 陈列柜 淋浴 浴帽 浴帘 淋浴门 淋浴头 碎纸机 泼妇 虾 神社 灌木 快门 暹罗猫 西伯利亚 兄弟姐妹 侧面 边柜 配菜 边车 边线 壁板 标志 指示牌 信号 签名 丝绸 丝袜 筒仓 银 银牌 银器 唱歌 烧焦 歌手 水槽 啜 坐/放置/坐落 坐着 滑板公园 滑板 滑板者 溜冰者 溜冰场 骨架 草图 串串 滑雪 滑雪靴 滑雪设备 滑雪服 滑雪缆车 滑雪杖 滑雪胜地 滑雪板 滑雪 滑雪鞋 皮肤 头骨 无边便帽 天空 天空塔 天窗 天际线 摩天大楼 激流回旋 石板 雪橇 睡眠 睡袋 睡衣 袖子 片 滑动 滑块 吊索 坡 投币口 老虎机 树懒 慢炖锅 鼻涕虫 贫民窟 气味 微笑 烟雾/抽烟 零食 蜗牛 蛇 鲷鱼 快照 通气管 鼻子 雪 雪豹 雪山 雪球 单板滑雪者 雪原 雪花 雪人 雪地摩托 雪犁 雪鞋 雪 肥皂 肥皂泡 给皂器 足球守门员 社会名流 短袜 插座 苏打水 垒球 软件 太阳能电池阵列 士兵 独奏 解决方案 宽边帽 歌曲 声音 汤 汤碗 汤匙 酸奶油 纪念品 豆浆 水疗中心 空间 航天飞机 空间站 宇宙飞船 意大利面 横跨 扳手 火花 闪耀 烟火 起泡葡萄酒 麻雀 抹刀 扬声器 观众 会话框 速度限制 限速标志 快艇 车速表 球 香料 调料架 蜘蛛 蜘蛛网 扣球 旋转 菠菜 尖塔 飞溅 海绵 勺子 体育协会 运动器材 运动团队 体育球 体育器材 运动会 运动服装 点 喷雾 伸展 春天 春卷 撒 洒水器 发芽 云杉 云杉森林 队 广场 南瓜 蹲 挤 鱿鱼 松鼠 水枪 刺 稳定的 (码放整齐的)一叠 体育场 工作人员 舞台 舞台灯 驿马车 弄脏 不锈钢 楼梯 楼梯 楼梯间 摊位/小隔间 种马 站/矗立/摊位 站 主食 订书机 星星 盯着 海星 杨桃 燕八哥 州立公园 公立学校 车站 固定自行车 文具 雕像 牛排 牛排刀 蒸汽 蒸汽机 蒸汽机车 蒸汽火车 馒头 钢 方向盘 (花草的)茎 模版 梯凳 立体声 听诊器 炖 戳/条状物 竹节虫 贴纸 静物画 高跷 黄貂鱼 搅拌 搅拌器 镫 缝 股票 长筒袜 腹部 石头建筑 石雕 石屋 石磨 凳子 停止 停在 红灯 停车标志 秒表 红绿灯 存储箱 储藏室 罐/蓄水池 商店 店面 鹳 风暴 暴风云 狂风暴雨的 炉子 扑克 跨骑 过滤器 海峡 带 稻草/吸管 草帽 草莓 溪流 街头艺术 街头艺术家 街角 流浪狗 街头食品 路灯 街市场 街头摄影 街景 路标 街头小贩 拉伸 担架 罢工 前锋 细绳 芝士条 带子 条纹 漫步 结构 工作室 影棚拍摄 材料 填充玩具动物 毛绒玩具 馅 树桩 惊人的 特技 佛塔 风格 手写笔 潜艇 潜艇形大三明治 海底水 郊区 地铁 地铁站 低音炮 多肉 绒面革 糖 糖碗 甘蔗 方糖 西装 套房 夏天 夏天傍晚 峰顶 太阳 太阳帽 日光浴 周日 日晷 向日葵 向日葵田 葵花籽 太阳镜 晴天 日出 日落 遮阳伞 阳光 超级碗 跑车 超级英雄 超市 超市货架 超模 支持者 冲浪 表面 冲浪板 冲浪者 外科医生 外科手术 环绕 寿司 寿司吧 背带裤 悬架 吊桥 越野车 燕子 燕尾蝶 沼泽 天鹅 天鹅游艇 运动裤 防汗带 毛衣 运动衫 甜的 红薯 游泳 泳帽 游泳者 游泳洞 游泳池 摆动 平转桥 秋千 漩涡 开关 转椅 剑 旗鱼 象征 对称 犹太教堂 注射器 糖浆 系统 t恤 t恤 塔巴斯科辣椒酱 虎斑 乒乓球拍 桌面 桌布 
平板电脑 餐具 转速表 拦截 墨西哥煎玉米卷 跆拳道 太极 尾巴 裁缝 拍/拿 起飞 说话/交谈/演讲 手鼓 棕褐色 橘子 胶带/磁带/终点线 挂毯 沥青碎石路面 芋头 篷布 果馅饼 流苏 味道 榻榻米 纹身 纹身艺术家 酒馆 茶 茶包 茶话会 茶园 茶壶 茶具 教 老师 茶杯 水鸭 团队合影 团队介绍 眼泪/撕裂/划破 技术员 技术 泰迪熊 T字形物 青少年 电线杆 变焦镜头 望远镜 电视 电视摄像机 电视室 电视演播室 温度 寺庙 天妇罗 网球 网球场 网球比赛 网球网 网球运动员 网球拍 帐篷 龙舌兰酒 终端/航站楼 阳台 地形 玻璃容器 领土 测试 测试赛 试管 文本 短信 纺织 纹理 感恩节 感恩节晚餐 剧院 戏剧演员 治疗 温度计 热水瓶 暖瓶 恒温器 灌木丛 顶针 东西 思考 蓟 宝座 金銮殿 扔 抱枕 雷 雷雨 百里香 皇冠 记号 票 售票亭 潮池 领带 老虎 紧 瓦 瓷砖地板 瓦屋顶 瓷砖墙 锡 锡纸 箔 提拉米苏 轮胎 纸巾 烤面包 烤面包机 烟草 烟斗 学步的小孩 脚趾 豆腐 马桶 马桶座圈 化妆包 东京铁塔 番茄 番茄酱 番茄汤 墓 钳子 钳子 工具 工具箱 牙刷 牙膏 牙签 修剪成形的花园 配料 火炬/光源 龙卷风 玉米粉圆饼 乌龟 大手提袋 图腾柱 龙猫 巨嘴鸟 触摸 触地 旅行 旅游巴士 导游 游客 旅游景点 锦标赛 拖车 毛巾 毛巾杆 大厦 塔桥 小镇 城镇广场 玩具 玩具车 玩具枪 玩具店 跑道 拖拉机 贸易 传统 传统的 交通 锥形交通路标 交通拥堵 交通堵塞 交通标志 小道 预告片 拖车 火车 火车桥 火车车厢 火车内部 火车轨道 火车窗口 教练 训练 训练长椅 训练场 电车/手推车 蹦床 变形金刚 透明度 旅行 托盘/碟子 跑步机 美食 树 树枝 林场 树蛙 树屋 树根 树干 试验 三角形 铁人三项 部落 支流 戏法/特技 三轮车 修剪 三人组 三脚架 长号 部队 奖杯 奖杯 热带 鳟鱼 卡车 卡车司机 浴缸 管子 拖船 郁金香 金枪鱼 苔原 隧道 涡轮 火鸡 转动 芜菁 绿松石 炮塔 乌龟 獠牙 电视演员 电视柜 电视剧 电视节目类型 电视名人 电视节目 情景喜剧 电视塔 枝条 黄昏 双胞胎 麻线 扭 类型 键入 打字机 尤克里里 奥特曼 伞 内衣 水下 独角兽 制服 宇宙 大学 向上 城市 尿壶 瓮 使用 用具 杂物间 吸尘器/真空 谷 阀门 吸血鬼 货车 香草 虚荣 种类 花瓶/瓶 金库 矢量卡通插图 矢量图标 蔬菜 菜园 蔬菜市场 植被 车辆 面纱 静脉 天鹅绒 自动售货机 小贩 通风孔 胡蜂属 船 背心 兽医 经验丰富的 兽医办公室 高架桥 视频 摄像机 电子游戏 录像带 视镜 守夜 别墅 村庄 藤蔓 醋 葡萄园 暴力 紫罗兰色 小提琴 小提琴家 中提琴演奏者 愿景 遮阳板 伏特加 火山 排球 排球场 排球运动员 志愿者 航行 秃鹰 华夫饼干 华夫饼机 货车 马车车轮 腰 服务员 候机室 等候室 走 步行 手杖 挂钟 壁纸 核桃 海象 战争 仓库 温暖的 警告标志 战士 军舰 疣猪 洗 洗衣机/垫圈 洗 洗衣机 黄蜂 浪费 废物容器 手表 水 水鸟 水牛 水冷却器 水滴 水景 热水器 水位 荷花 水上乐园 水管 净水器 滑水板 水上运动 水面 水塔 水彩 水彩插图 水彩画 瀑布 喷壶 水印叠加图章 西瓜 防水外套 水路 波浪 蜡 武器 穿着 天气 叶片 网 摄像头 婚礼 结婚戒指 婚礼花束 结婚蛋糕 新婚夫妇 婚礼请柬 婚礼派对 婚纱照 婚礼摄影师 婚纱摄影 婚宴 楔 杂草 重量 体重秤 焊接工 井 西餐 西餐厅 湿 吧台 潜水衣 湿地 潜水服 鲸鱼 鲸鲨 小麦 麦田 车轮 轮椅 后轮支撑车技 生奶油 搅拌器 胡须 威士忌 哨子 白色 白宫 白葡萄酒 白板 便门 宽的 挥动 假发 Wii Wii手柄 荒野 角马 野火 野花 野生动物 柳树 风 风铃 风电场 风力涡轮机 风车 窗户 窗台花盆箱 橱窗展示 窗框 纱窗 靠窗的座位 窗台 雨刮器 挡风玻璃 有风的 酒瓶 冷酒器 酒柜 酒窖 酒杯 酒架 品酒 酒庄 翅膀 冬天 冬瓜 冬天的早晨 冬季场景 冬季运动 冬季风暴 电线 紫藤 巫婆 女巫帽子 炒锅 狼 女人 木头 林鸳鸯 木地板 木墙 烧木炉 木匙 林地 啄木鸟 木工刨 羊毛 工作 练习卡 工作台 工人 工作场所 车间 世界 蠕虫 敬拜 伤口 包 裹身裙 包装纸 搏斗 摔跤手 皱纹 腕带 写 作家 手写/字迹 毛笔 写字桌 游艇 牦牛 院子 黄色 瑜伽 瑜伽垫 酸奶 轭 蛋黄 青年 青年旅馆 蒙古包 斑马 斑马线 禅意花园 拉链 拉链 僵尸 粽子 动物园 ================================================ FILE: 
ram/data/ram_tag_list_threshold.txt ================================================ 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.71 0.75 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.61 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.7 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.82 0.8 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.85 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.77 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.89 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.78 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.9 0.65 0.83 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.79 0.65 0.65 0.8 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.79 0.65 0.63 0.65 0.87 0.8 0.46 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.8 0.65 0.8 0.8 0.8 0.65 0.65 0.84 0.65 0.65 0.79 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.81 
0.65 0.8 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.87 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.83 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.77 0.87 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.85 0.65 0.68 0.65 0.8 0.65 0.65 0.75 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.8 0.8 0.79 0.65 0.85 0.65 0.65 0.65 0.9 0.65 0.89 0.8 0.65 0.65 0.65 0.76 0.65 0.65 0.65 0.65 0.65 0.65 1 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.89 0.7 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.71 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.8 0.65 0.8 0.8 0.9 0.65 0.85 0.8 0.8 0.8 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.75 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.63 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.88 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.71 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.9 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.71 0.65 0.8 0.76 0.85 0.8 0.65 0.65 0.8 0.65 0.79 0.65 0.75 0.65 0.8 0.65 0.86 0.65 0.65 0.9 0.9 0.65 0.65 0.65 0.65 0.65 0.73 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.65 0.85 0.65 0.65 0.65 0.65 0.8 0.75 0.65 0.65 0.65 0.65 0.8 0.85 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.77 0.65 0.65 0.65 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.6 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.74 0.65 0.65 0.67 0.65 0.65 0.8 0.65 0.65 0.85 0.65 0.8 0.65 0.65 0.84 0.8 0.8 0.8 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.9 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.89 0.65 0.65 0.65 0.83 0.65 0.65 0.65 0.65 0.6 0.65 0.8 0.8 0.8 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.8 0.65 
0.65 0.8 0.65 0.8 0.65 0.77 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.74 0.65 0.65 0.66 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.84 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.88 0.65 0.65 0.8 0.65 0.65 0.7 0.65 0.65 0.65 0.9 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.82 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.75 0.65 0.7 0.9 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.88 0.65 0.65 1 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.71 0.65 0.65 0.65 0.79 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.88 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.82 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.88 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.8 0.65 0.65 0.65 0.87 0.65 0.66 0.65 0.84 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.84 0.65 0.65 0.65 0.65 0.65 0.9 0.8 0.65 0.65 0.65 
0.65 0.65 0.5 0.65 0.64 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.81 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.84 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.8 0.65 0.85 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.73 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.87 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.82 0.8 0.65 0.65 0.65 0.84 0.9 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.64 0.65 0.65 0.65 0.8 0.8 0.87 0.65 0.65 0.78 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.8 0.65 0.85 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.74 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.88 0.65 0.65 0.65 0.65 0.65 0.65 0.83 0.89 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.85 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.9 0.65 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.65 
0.65 0.87 0.8 0.84 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.81 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.7 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.82 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.87 0.65 0.9 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.8 0.7 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.85 0.65 0.65 0.65 0.65 0.65 0.73 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.89 0.8 0.65 0.9 0.65 1 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.89 0.89 0.65 0.65 0.65 0.8 0.75 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.65 0.88 0.65 0.8 0.65 0.65 0.8 0.85 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.9 0.57 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.8 0.8 0.79 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.89 0.8 0.65 0.8 0.65 0.8 0.65 0.81 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.89 0.84 0.65 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.8 0.83 0.65 0.65 0.8 0.65 0.65 0.72 0.65 0.65 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 1 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.69 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.71 0.65 0.65 0.65 0.88 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.85 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 
0.65 0.65 0.9 0.8 0.9 0.65 0.8 0.8 0.65 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.8 0.65 0.85 0.65 0.65 0.8 0.65 0.89 0.65 0.65 0.9 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.86 0.65 0.77 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.75 0.8 0.65 0.8 0.88 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.88 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.82 0.65 0.65 0.8 0.65 0.8 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.83 0.65 0.65 0.92 0.89 0.8 0.8 0.65 0.65 0.65 0.65 0.75 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.85 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.87 0.65 0.79 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.83 0.8 0.65 0.65 0.8 0.8 0.65 0.7 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.9 0.8 0.65 0.65 0.65 0.65 0.7 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.65 0.65 0.65 0.65 0.8 0.82 0.65 0.8 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 1 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.64 0.65 0.65 0.63 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.76 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.65 0.75 0.65 
0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.87 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.82 0.65 0.65 0.65 0.65 0.65 0.8 0.89 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.9 0.8 0.65 0.73 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.86 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.9 0.65 0.9 0.65 0.65 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.86 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.69 0.65 0.65 0.65 0.65 0.65 0.88 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.72 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.9 0.9 0.8 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.45 0.8 0.65 0.88 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.51 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.66 0.65 0.8 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.81 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.75 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.66 0.65 0.65 0.65 0.65 0.65 0.65 0.8 
0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.9 0.8 0.65 0.85 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.81 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.79 0.75 0.65 0.65 0.8 0.65 0.67 0.8 0.8 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.81 0.8 0.65 0.65 0.9 0.65 0.79 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.77 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.74 0.65 0.65 0.65 0.65 0.65 0.65 0.6 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.89 0.8 0.65 0.65 0.88 0.65 0.65 0.65 0.9 0.75 0.65 0.65 0.65 0.8 0.6 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.84 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.85 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.63 0.65 0.65 0.65 0.7 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.84 0.65 0.65 0.8 0.65 0.81 0.8 0.8 0.8 0.82 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.65 0.88 0.65 0.8 0.65 0.7 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 1 0.8 0.8 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.74 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.85 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.9 0.86 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.64 0.65 0.65 0.8 0.8 0.65 0.87 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.65 0.65 
0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.7 0.65 0.65 0.8 0.65 0.65 0.75 0.65 0.65 0.65 0.65 0.65 0.65 0.85 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.71 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.73 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.86 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.75 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.88 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.81 0.65 0.65 0.8 0.65 0.65 0.9 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.9 0.65 0.65 0.65 0.65 0.7 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.77 0.65 0.65 0.65 0.65 0.65 0.85 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.87 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.57 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.76 1 0.8 0.65 0.65 0.58 0.8 0.65 0.65 0.65 0.65 0.65 0.8 1 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.9 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.8 0.9 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.68 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.99 0.8 0.77 0.65 0.9 0.65 0.65 0.88 0.65 0.65 0.65 0.65 0.9 0.65 0.88 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.8 0.8 0.65 0.7 0.65 0.65 0.8 0.9 0.65 0.65 
0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.77 0.65 0.65 0.65 0.65 0.79 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.85 0.65 0.65 0.65 0.65 0.65 0.65 0.52 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.86 0.65 0.65 0.8 0.56 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.72 0.65 0.65 0.65 0.8 0.8 0.65 0.9 0.65 0.65 0.8 0.65 0.8 0.6 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.88 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.89 0.85 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.87 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.75 0.65 0.65 0.65 0.65 0.54 1 0.65 0.65 0.75 0.65 0.75 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.9 0.62 0.65 0.65 0.65 0.65 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.82 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.9 0.74 0.8 0.65 0.8 0.8 0.7 0.65 0.65 0.65 0.89 0.65 0.65 0.8 0.8 0.8 0.8 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.8 0.8 0.84 0.8 0.65 0.65 0.8 0.75 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.82 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.84 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.8 0.65 0.7 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.74 0.65 0.8 0.65 0.65 0.65 0.9 0.65 0.65 0.65 0.65 0.85 0.65 0.9 0.9 0.65 0.65 0.65 0.63 0.82 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 
0.65 0.65 0.65 0.8 0.7 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.65 0.74 0.9 0.65 0.8 0.65 0.65 0.58 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.75 0.65 0.65 0.8 0.65 0.65 0.88 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.87 0.65 0.65 0.65 0.8 0.65 0.64 0.65 0.65 0.65 0.8 0.87 0.65 0.65 0.8 0.9 0.65 0.65 0.65 0.65 0.8 0.8 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.83 0.65 0.65 0.8 0.65 0.9 0.65 0.8 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.78 0.65 0.8 0.65 0.9 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.9 0.65 0.88 0.8 0.65 0.65 0.65 0.81 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.77 0.65 0.65 0.65 0.8 0.8 0.8 0.8 0.65 0.65 0.65 1 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.85 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.88 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.8 0.8 0.8 0.65 0.65 0.65 0.65 0.68 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.9 0.65 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.81 0.65 0.65 0.65 0.8 0.85 0.65 0.77 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.8 0.8 0.9 0.65 0.65 0.89 0.65 0.65 0.8 0.65 0.65 0.8 0.8 0.65 0.65 0.65 0.88 0.8 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.82 0.65 0.8 0.74 0.65 0.65 0.65 0.65 0.65 0.65 0.85 0.65 0.65 0.85 0.65 0.65 0.65 0.65 0.7 0.7 0.8 0.65 0.65 0.65 0.65 0.87 0.8 0.65 0.65 0.65 0.89 0.85 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.7 0.65 0.65 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.9 0.8 0.8 0.65 0.66 0.57 0.65 0.65 0.65 0.49 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.8 0.65 0.65 0.65 0.8 0.65 0.65 0.65 0.65 0.65 0.65 0.8 0.65 
0.65 0.65 0.65 0.8 0.65 0.8 0.8 0.86 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.89 0.65 0.65 0.65 0.65 0.65 0.65 0.76 ================================================ FILE: ram/data/tag_list.txt ================================================ tennis bear cub observatory bicycle hillside judge watercolor illustration granite lobster livery stone ceramic ranch cloth smile building tattoo cricketer cheek pear source winter surface spray ceremony magic curve container fair medicine baby tennis racquet ornament bamboo duckling song safari team presentation daffodil cross toothpaste shield fashion model capsule map creek glass house glass plate siding corner water buffalo bison figure skater diploma tire race cable car brain gas stove soap bubble palette snowboard school child trench coat monk fiber kitchen window sunglass coffee security strawberry penguin tree root loaf engagement ring lamb vector cartoon illustration sandwich mountain village shape charm fiction knot greenhouse sushi text disaster trophy gang strap soccer game cardinal tee turtle water surface grassland dolphin store dirt iceberg pergola farmer market publicity portrait tote bag teenage girl view mirror session commuter dressing room tricycle christmas ball headlight police armchair chart yacht saw printer rock band gingerbread house tag table lamp hockey game slope font wicker basket jewelry quarter software weapon pin worship painter goal morning light bike baseball bat elevator cuisine sausage stunt wrestler statue landing pillar willow tree sea wave chicken peanut muscle bob tv genre bathroom window radish textile pelican marketplace crest elevation map gift parish traffic light campfire fog award winner beach ball mat white house plaster moped football team solution bicyclist bit playground darkness cake maple leave mold cracker blueberry rubble container ship pedestrian bridge snail parrot form circuit highlight pickup truck koala rain system weather raincoat soccer team windshield thunderstorm mike 
bird house bridge grandfather restroom animation wilderness clown banana brown braid dining room kindergarten launch event purple school stairwell brooch movie poster image mountain river shelf wicket headboard buddha flower field dugout cd bald eagle lagoon seaweed agriculture emergency service maple tree parachute continent amusement park remote bun tackle hospital garage door birthday party friendship go mausoleum jeep raccoon step ice hockey team cigarette lace dress forest floor mall captain milk golf course meal picnic table sail volleyball canal terrace computer desk caravan hotel cheerleader nurse museum marsh fox plateau night twin letter logo autumn tree powder convention creature lighthouse shop window jacket stork taxi trade blackboard olive road sign resort snowflake cemetery travel evening dress picnic drink winter morning football player snack boxing glove dinner party airline swing port wheelbarrow bathroom sink sweater ambulance gear oil wii controller array home office car show mixture profession tree frog square facility coral reef sea wall pizza exhibit demolition trout ring coffee shop bracelet bean lip fencing landscape sitting package metal bust king hair window seat wildlife trunk greenery stencil fire hydrant bridesmaid plaza alps tower bridge crop top crossing cinema pedestrian crossing family shopping cart stomach church building screen door skater soccer field kettle mussel raindrop candy cane water lily flower girl desert enclosure christmas light kitchen caterpillar plaid bath bush mud ballet knee adult raft sea view cactus office chair overall rim scaffolding pig cover poster page sprinkle chandelier algae traffic surfboard book filming flash mansion camouflage trouser ticket weed cab trench elephant huddle sphere christmas decoration city launch doll christmas ornament fabric bikini biplane breakfast neighbourhood race track foliage avocado school bus footwear highway ocean view art vector illustration wall clock curtain teenager 
kitchen area robot tusk lounge chair beam paddle camel lid world map city view newlywed cargo ship yellow exhibition bend novel wool ontario bread campus coastline cutting board booth table top carpet beach chair workout street food fun costumer film designer gadget artist fishing village builder violinist iphone spider web traffic sign ruin rescue clipboard seal film director paw nursery intersection tomato sauce taste paddy field christmas tree wave stool watering can rug daytime subway station craft pine forest black planet motif christmas market glass window college wheat damage rectangle picture frame chess guest room street corner religion seed puzzle freeway beauty ocean watch mother garage quote dj supporter hip hop artist muffin eiffel tower cash firefighter cauliflower bunker sled manicure shark stall jungle family home tour bus chimney touchdown roundabout coyote street scene tank wedding dress mantle bedroom window coconut chapel goat living space rock wall polka dot railway mandala mango lesson mountain landscape team photo bookshelf meter bulldog evening sun stick card pink fish pond paint pill cart pea van album football college game mountain pass doughnut ski slope match official shadow organ celebration coin log cabin firework display present twig chef confetti footpath tour ponytail artwork race car club season hose pencil aircraft rock formation wardrobe participant politician engineer peace filter sailing boat water bottle service dog poodle loki statesman sleeping bag outskirt clock factory oak tree physician color room stairway company lady graph faucet tablecloth subway train chocolate chip cookie headquarters screw goggle halloween city street swirl cord forward bone bedding archway wig lobby mask attic kitchen table skylight fire exit oil painting passenger meditation salmon fedora rubber stamp orange juice arch scientist stroll manhattan float baseball uniform circle church decker bus competitor zoo basketball team tourist daughter 
silverware ceiling fan birth vase jack mushroom spiral cage limb salad ad control earth party bolt tractor barley wedding photo hawk warehouse vegetable garden chocolate cake cabbage floor window baby shower magnifying glass table stethoscope reading mission croissant gift box rocket forest road cooking suite hill country motorcycle baseball player angle drug sport association championship family portrait florist softball egret office plywood jockey mosque brunch beanie office building pattern calendar indoor pepper ledge trail fuel laptop computer tennis shoe deck chair guitarist barn surgery cartoon illustration nebula railroad mountain goat goose car door cheer liquid hardwood floor pathway acorn gull airliner couch lake house spaghetti promenade collection garden bank robin tennis ball peony gymnast lavender deck test riverside rapper domino bride mouse basil wedding couple ocean wave arm kitchen floor grove family member backyard raspberry forest fire officer hibiscus canyon composer signature olive oil hibiscus flower rose vector icon sunrise horseback motor scooter office worker tradition ingredient washing machine lighting bagel sailboat policeman mare graphic halloween pumpkin stock pilot education team body horse kimono bazaar bag recording studio parsley entrance denim vet horse farm charcoal architecture glass vase puppy estuary television show host city bus shoulder beast balance golfer roadside denim jacket stone wall counter top app icon toast head coach ham warrior gem refrigerator snowman construction worker coal website morning fog mustard human owl puppy dog piggy bank vegetation pirate action film marshmallow thanksgiving business disease signage greeting skate park tile mouth spinach vacation leader shrine walker science fiction film bill rabbit motor boat bar radio barge tail chainsaw gallery rainbow pasta padlock web pastry ink reef school uniform shawl treasure peach dinner table injury harbor witch car dealership litter gesture documentary 
marriage sea shell priest dome kit icon seaside bucket entertainment stable hat puddle sock shopper technology harbour orbit antler tube flag waving cook tight commander farmland switch hiker wedding ceremony award ceremony champion chopstick farmhouse performer spike accident cruise ship passenger train attraction entertainer rear view sidewalk parade racing plane ritual peacock pocket plum drop carrot floor sunset troop architect coffee table dust outline leather charity event heat whale laundry coconut tree crosswalk pony ant pipe string coat angel beef church tower dish pitch cupboard thermometer dirt field fireworks minute cane pajama flower garden autumn trash can dachshund banana tree tray moose roadway carnival antenna pole castle wall ram cattle hay cookie swimmer baseball team strait hedge jet fire pit octopus calf cube opera cardboard box tiara kitchen sink prairie bowl galaxy straw hat linen ski resort stitch street lamp motorist icicle stain flora drain kitchen cabinet decor bouquet pound interior design nail polish figurine tomb disc twist blouse ribbon figure burger cork soccer goalkeeper train bridge drinking water dew baker storm cloud tarmac tv drama sponge magnet sailor entry swan exercise sloth jewel scuba diver bite cat tree tent can tennis match ecosystem picket fence palm train car frying pan rally tablet pc reindeer image wolf chin conservatory flood water cityscape beach sand car park pavement farm field swimming winter storm stem pillow inning gorilla desk avenue fern money pearl train station skillet nap barber library freezer label rainforest parking sign mirror wing noodle press room sculpture tablet viewer prayer mini mechanic laugh rice field hand mustache mountain road catwalk conference cape installation musician stream machine speech crocodile soccer match town square passport post box point stone building motorway mix dentist businessperson happiness boat vineyard treadmill glass wall water droplet coffee mug graduate sunflower 
parliament shepherd movie wine orchard tulip motherboard cup broom spot drawing polo shirt graduation film producer moonlight glow film format t shirt rock face sword clinic festival day meadow staple pupil training ground rider flower foal wharf foot bridge shooting top mast police car robe wedding bouquet stop sign birthday cake glitter butter scooter tundra superhero pocket watch inscription youngster fruit tree movie poster engine foundation motorcyclist take woman antelope country artist road trip typewriter tuxedo brand pine bathroom paradise texture balloon dining table home computer screen actor clip tv tower panorama summit cat plot eagle dancer pup studio shot tear bird bath classroom bookstore city wall tv programme blade easel buttercream sweet designer diamond handshake herb corn field seafront concrete street artist gas stamp window display paper note pint quarry research fixture manager soil leopard board game ladder stop light island ramp football match icing drill currency summer evening topping pyramid pomegranate cell ivy squad scenery computer locomotive surf mascot dune path duck twilight wire bow tie strike cormorant car wash crane market philosopher alarm clock camera birch greeting card plain clay donut lock moth laboratory fan violin jazz fusion artist mountain biker terrain magazine pickup comedy film smartphone film bed microwave oven tournament lawn car window alligator screen jetty shopping bag landscape view cabinetry friendly match thing petal shopping center transport ballet dancer shoreline princess car seat parking meter green vodka band rock costume warning sign strip plaque wheelchair headband ginger dice media hairdresser press living room stove player cherry workshop carving embroidery doodle adventure rugby player monument brush marker loft postcard collage ball professor dresser gig festival blackbird makeup artist video camera sticker peak wildflower santa hat rodeo wedding photographer guy staff waterfall operation defender 
falcon haze individual gentleman greyhound rocking chair rice garbage platter chocolate splash business suit cheetah valley maze trampoline garland slalom unicorn tree stump painting romance fight alcohol ghost fondant spa shutter death demonstration cotton pier flea market history savannah fist aisle crew jug pose anchor teapot boat house business team tripod bee pebble mattress canvas hallway campaign pod lake district article white sofa honey marathon pancake tourist attraction wedding gown battle shelving sea sheet music pie yarn construction site flyer tie star lettuce martial artist dart straw reflection conference room temperature rugby mosquito physicist rock climber crash backdrop toilet seat sand castle water park toy car waste luxury hangar rv tree trunk board gold project picture cap cottage relief attire microscope battery roll line parking garage crystal broadcasting brick wall lab flooring meeting 3d cg rendering desktop computer cowboy sailing ship junction hairstyle homework profile model flower pot street light salt lake maple space blizzard throw zebras brochure constellation beak kilt pond blue sky sneaker sand dune morning sun almond grill curl basketball girl game chameleon toilet bowl prince keyboard queen computer monitor writing crown basilica kiss house parking football competition shell sport equipment comedy baboon vendor rise building wrap food truck cat bed rickshaw flare teal nectar eclipse vehicle steam locomotive gorge cow christmas card demonstrator memorial towel jewellery train frisbee baseball game fur afternoon sun community sparkler bandage firework dollar pasture video bus tree house seashore field hamburger souvenir hedgehog worm pine cone osprey dinosaur vegetable junk poster army winger bundle stage growth wedding party service blanket ruler eye credit card castle diner hut elk hard rock artist nun dog breed nest drama film number icon water tank giraffe altar pavilion tv personality suv street vendor street sign ditch 
debris foam takeoff spice mountain lake tea orchestra spacecraft counter abbey mountain hydrangea racer orange tree tide cowboy hat rapid town wild herd vein driveway jar bark illustration horror film corn stroller industry mountain stream gym neckline pan client spectator eggplant camper fawn hoodie meat lemonade food market slum comic book character flower market love palace gun heel shopping street shooting basketball guard family photo rooftop laundry basket airport runway horn face mask flight appetizer violet country lane cement instrument tv actor spark celebrity award country house standing auction date engagement puck advertisement chair zebra driftwood bumblebee maple leaf bonnet orange water tower door singer floor plan discussion theatre pilgrim mug branch window sill baseball pitcher bakery lollipop basketball player toilet paper chalkboard cabin sign night sky cannon fishing net submarine suit fur coat wine bottle folder street art suspension bridge evening sky billboard postage stamp newspaper transportation surgeon light park horizon road sand bar trumpet lounge cloud forest birthday celebration balcony anime beehive umbrella goldfish baseball cap waterhole ceiling carousel backpack plant pot atmosphere sunflower field spire vision woodpecker chip pool table lotus flower cone humpback whale reservoir hunt piano plate dining area luggage skier dance floor crow stair overpass opera house bear jazz artist water vessel cast yard cathedral basketball hoop graveyard sound berry onlooker fauna birch tree retail hill skeleton journalist frost basket nail dusk trash dawn clover hen volcano basketball coach home decor charge haircut sense university lizard daisy tablet computer grass field prison metal artist bathroom mirror window frame chest flavor pop country artist market square monkey blog deer speech bubble dog independence day girl boy tartan furniture appliance office window fish boat sand box tv sitcom drama sleigh depression paper towel baseball 
protestor grape wedding cake invitation accessory pick grandparent racket tea plantation outdoors egg glass bowl sun organization lion panel station wallpaper helicopter salt vanity patio lunch street performer mountain range soup bacon power station cantilever bridge hummingbird shirt rope hip chalk pendant choir tv lichen railway bridge art gallery bartender wagon baby elephant accordion horseshoe building site clutch harvest savanna geranium business woman paddock patch beech tree war suburbs hospital bed motorcycle racer moss gravel government agency dollar bill father fjord concert nut wedding photography finish line home plate food nose thumb village dining room table bumper monster blackberry lime conflict gala wallet wrist hug mermaid lava lawyer folk rock artist arena onion toothbrush fashion perfume flip triangle woodland mail grasshopper studio wood floor den racquet cello lemur astronaut glass table blood dvd planter silver leash master bedroom forest batter shoe engraving opening product toe cocktail mallard duck bike ride oasis wedding ring cinematographer holly autograph fence ice cube cove pineapple aurora glass bead produce apartment building cob miniature cockpit flashlight frog sheep groom steel watermelon clip art paper plate ostrich contour mural cub paisley bandanna winery turn handle satellite post pork child asphalt grocery store vulture trolley nightclub brick trailer compass cereal cafe cartoon character sugar fiction book glass floor umpire guitar hamster protester airplane garment blazer railway line wedding shoe box parking lot construction graduation ceremony tram telescope copper pain autumn forest guest house partner crayon dip boot corridor computer keyboard hockey player chicken coop bus station gathering ankle bunk bed wood table football coach monarch pharmacy legging mannequin female train track stack canopy design element grandmother symbol beach hut zucchini bomb businessman skyscraper tongue case sparkle highland ballroom 
prom estate customer archipelago cheese debate carriage bulldozer pumpkin sitting room gas station wedding reception camp dog bed tower property river bed pop latin artist fridge wine glass coast beer tow truck fire truck mountain bike thigh heron boat ride gondola turquoise lake llama kitty tin waiting room coffee cup socialite guard tap waterway forehead list erosion box sea lion pollen dam wasp salon tennis tournament flower box aquarium rain cloud clothing store lead singer cupcake tortoise lettering sport facility dance dog house nature football rooster footballer railway track crowd fishing rod silhouette wind turbine sari bus window cloud charity medal yoga event veil fashion menswear milan week news knife print screen tv walnut fungus ice cream computer mouse play tribe picture video game business card music festival rack envelope shower dirt road mine oyster monarch butterfly dude fruit salad podium fork lace test match boulder cricket player staircase peninsula shopping popcorn oak market stall pine tree mountaineer student closet hood handstand centerpiece insect patient makeover tennis player sheet park bench apple organism hook turkey tangerine sibling shopping mall bird scarf smoothie net grass napkin ray eyebrow laptop keyboard motorbike woman hand oven book cover easter egg microwave sand snapshot soccer ball makeup knight bowling ball shower curtain flame lightning running power plant crib cartoon moat fashion girl wedding invitation bottle cliff monastery file photo apartment casino cream sweatshirt storm cruise teddy bear shovel wind farm writer dock professional hotel room job monitor donkey pass interview duchess mark plank beard zombie trio channel cricket team windmill vest diagram cable winter scene golden gate bridge buffalo studio portrait pagoda whiskey freight train kite future steam train phone box headset wood snowboarder paper bag slide grapefruit seating morning bronze sculpture theatre actor stump jean landmark jam waist watercolor 
hammock light fixture ice basin beverage shelter premiere mound ear bronze sunlight street energy barn door hike fleet claw beach pepperoni bin trainer buffet archive toddler referee bay window dove production company evening light gate farm reed fruit stand explorer snow storm throw pillow button display case bookcase lead lipstick basketball court cargo ensemble pope clock tower teen speaker rat laptop ski mess stadium ferry boat bunny waterfront downtown sink press conference dinner condiment thread audience grid car plastic people barbecue pigeon urinal seagull volunteer hockey fir tree pollution trial collar area meeting room circus yogurt orangutan viaduct comedian drone scissor pop rock artist biscuit panda water feature air balloon remote control watercolor painting show walk post office bike path rap gangsta artist microphone crack sunset sky glass tv show cartoon style stripe foyer signal calligraphy bulb gardener coffee bean spider tapestry city skyline necklace kitten traveler veteran frosting fry tennis court tank top butterfly house mist drummer water level scale baseball glove music video performer champagne camping clothing water drop telephone box pen morning mist fire engine porch opening ceremony style palm tree fashion show universe scratch axe ottoman explosion rib boutique game cucumber fruit stone bridge nature reserve track train window punch telephone pole velvet sauce moon contrast flamingo bat vending machine ship equestrian shade comforter pallet sparrow wii glaze grocery steeple soccer player contract advertising runner chimpanzee world seat project chihuahua bubble willow pedestal soul hip hop artist curb drawer leaf banner launch party coach government snowball toy portrait doctor whiteboard electronic tiger graffiti column nightstand whistle maxi dress bench wetsuit bird feeder football game basketball class bathroom door store window text message wreath street view binocular pet facade drought lemon new year night view airplane 
window specie rule jaw wheat field diet pop artist habitat screenshot scoreboard shore mane quilt ski lift orchid turban christmas airport marina glass door glass bottle restaurant conductor logo sleep tape tomato river bank lilac tooth training pottery shop steam engine mason jar base procession border shoot footprint hotdog bull stocking recreation automobile model design country pop artist river retriever department store auditorium sport car supermarket belt cricket window box dress shirt letter residence megaphone pant wildfire bird nest crab swimsuit candle funeral mill national park plant cop power line perch blue finger ferris wheel globe skateboard helmet movie theater uniform hammer material kid well butterfly sideline fashion fall show planet earth lift male sauna gray flour sand sculpture program cabinet infant wheel aircraft model dough garlic skate arrow wrapping paper ripple lamp iron banknote beaver ferry courtyard bassist countryside steak comfort boxer laundry room campsite brick building golf subway headphone fort handbag drum flood saddle bass labyrinth needle sun ray app menu president cardigan dandelion wetland ice hockey player number city hall fishing portrait session pug key art print minister hurdle emergency painting artist flag pole evening purse recipe golf ball coloring book mountain peak senior holiday bud cousin pantry lap skin flag tissue paper ridge wire fence surfer climber photograph sewing machine cooler actress apple tree cancer starfish automobile make dumbbell brace tunnel window paint artist composition school student condo convertible cushion selfie territory guide tree court shrimp stone house dress eyelash juice broccoli chain tourism mountain top concept car film premiere light bulb cafeteria badge flower bed theater root racecar driver basketball boy game glove skyline wall glacier airport terminal bug trim railway station briefcase flat fountain person lane asparagus art lantern dishwasher director snake lecture game 
controller tree branch pub bathing suit queue belly poppy bow pitcher ice cream cone cave candy road bridge host traffic jam earring file foot watermark overlay stamp mailbox supercar railing bedroom seafood waffle bronze statue plan flow marble basketball game automobile scene cypress tree soldier skateboarder glass building cherry tree pump grain wildebeest loop frame bathtub saxophone diver stalk lily bead alley flock family room manufacturing pointer worker navy potato teacher photography dolly boardwalk water fountain athlete side dish bay ice hockey phone hero face gold medal blind swamp researcher swim meatball iguana leather jacket jellyfish site smoke traffic signal melon beetle calculator skirt plantation sculptor barrier catcher security guard sketch awning steering wheel mountain view bus stop pool leg spotlight apron mineral inlet sleeve torch emotion march police officer performance lamp post fishing boat summer presentation saucer suitcase supermodel goalkeeper shrub rock artist document beach house man blue artist cigar railroad track gown mosaic bungalow alphabet baseball field shed pedestrian rail soap kitchen counter dessert dunk blossom conversation fruit market glass jar military beer bottle photographer tennis racket competition escalator bell tower stilt ballerina television feather fence post rear dahlia red carpet tub hole fortress pack telephone cardboard city park platform college student arch bridge wind blender bloom ice rink birthday raven fairy embankment hall flower shop suburb barrel biker steam dragonfly formation electricity business people symmetry walkway fisherman gas mask loch youth hanger dot fish street market animation film crime fiction film boar emblem halloween costume kangaroo couple spoon squirrel neon sign sky office desk beauty salon breakwater fashion look toaster author news conference outdoor canoe dragon tool shopping centre ladybug swimming pool landscaping ski pole red truck fly temple level sunday railroad 
bridge car mirror lawn mower flute aircraft carrier fashion menswear london week sunshine tile floor skull fossil flower arrangement diaper sea turtle cherry blossom fireman shack lens waiter animal basement snow autumn park glass box kick head anniversary vine back paper lantern fish tank cellphone silk coral notebook photo gazebo ketchup driver farmer bonfire chestnut photoshoot football field olive tree pheasant sandal toilet fireplace music deity fish market fig bell neck grave villa cyclist crate grey asphalt road soccer hostel municipality courthouse roof end table pot sedan structure folk artist sport sport team protest syringe fashion designer jersey heart shape kayak stare sit with direct read photograph spin teach laugh carve grow on warm watch stretch smell decorate shine light dance send park chase collect lead kiss lead to lick smile cheer sit point block rock drop cut ski wrap lose serve provide sleep dress embrace burn pack stir create touch wash stick reveal shop train paint groom hunt bloom play pay brush shoot hold picture carry sip contain turn pour pitch give add blow look in show walk illuminate kneel cover drag post present fit operate fish race write deliver peel push run sit around buy jump walk on attend clean sell ride on mount host dry plant sing row shake perch ride fight skateboard live call surround practice play on work on step relax hit fall in flow greet launch wear hang on drive sit in break learn fly connect display locate compete go for sail lift toast help run on reflect pose scratch frame dribble herd enter exit place inspect build pick fill grind skate offer float sit by stand release rest singe climb tie mark lay stand around capture set land swinge run in kick lean head sign approach swim close crash control fall remove repair open appear travel load miss check surf moor smoke drink board seat feed rise sit on swing grow strike date slide share graze jump in lie extrude roll move gather eat pull run through squeeze lay on 
draw play with wave assemble perform march score attach adjust hang hug sleep on throw live in talk pet work run with see flip catch cook receive celebrate look classic bridal indoor industrial teenage mini grassy aged long warm light handsome happy three pregnant circular urban silver ceramic 3d green blonde golden dark tropical ripe deep fat musical giant medical medieval bare stunning bold geographical huge plastic foggy stormy gothic biological empty clear antique pink steep brown striped aerial rainy cool flying commercial purple trendy blank haired dead wooden flat high beige panoramic angry dozen rural solar big small stained thick many fresh clean strong abstract crowded retro dry gorgeous martial modern blue cloudy low four outdoor single much beautiful snowy pretty new short sunny closed rocky red two double male gray five colorful automotive various one old rusty tall wild narrow natural several frozen textured lush young hot mixed white float quiet round bright religious female historical shiny traditional tourist yellow bald coastal lovely little broken romantic wide royal rich open cute ancient cold political elderly gold full rustic metallic floral sad wet fancy senior tiny stylish large frosty orange transparent electronic shallow scared armed dirty historic black few windy some square ornamental sandy thin ================================================ FILE: ram/models/__init__.py ================================================ from .ram import ram from .tag2text import tag2text ================================================ FILE: ram/models/bert.py ================================================ ''' * Copyright (c) 2022, salesforce.com, inc. * All rights reserved. 
class BertEmbeddings_nopos(nn.Module):
    """Construct the embeddings from word embeddings only.

    This variant has no position-embedding table: the output is the word
    embedding (or the caller-supplied ``inputs_embeds``) passed through
    LayerNorm and dropout.
    """

    def __init__(self, config):
        """Build the embedding table, LayerNorm and dropout from ``config``.

        Reads ``vocab_size``, ``hidden_size``, ``pad_token_id``,
        ``layer_norm_eps`` and ``hidden_dropout_prob`` from ``config``.
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.config = config

    def forward(
        self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Embed ``input_ids`` (or reuse ``inputs_embeds``), then normalize and drop out.

        Args:
            input_ids: token indices looked up in ``word_embeddings``; ignored
                when ``inputs_embeds`` is given.
            position_ids: accepted for signature parity with ``BertEmbeddings``;
                unused here.
            inputs_embeds: precomputed embeddings used instead of a lookup.
            past_key_values_length: accepted for signature parity; unused here.

        Returns:
            The normalized embeddings after dropout.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if inputs_embeds is None:
            if input_ids is None:
                raise ValueError("You have to specify either input_ids or inputs_embeds")
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = self.LayerNorm(inputs_embeds)
        return self.dropout(embeddings)


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        """Build word/position embedding tables, LayerNorm and dropout from ``config``.

        Reads ``vocab_size``, ``hidden_size``, ``pad_token_id``,
        ``max_position_embeddings``, ``layer_norm_eps``,
        ``hidden_dropout_prob`` and the optional ``position_embedding_type``
        (defaults to ``"absolute"``).
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Embed the input, add absolute position embeddings, normalize and drop out.

        Args:
            input_ids: token indices looked up in ``word_embeddings``; ignored
                for the lookup when ``inputs_embeds`` is given.
            position_ids: explicit position indices. When ``None`` a slice of
                the ``position_ids`` buffer starting at
                ``past_key_values_length`` is used.
            inputs_embeds: precomputed embeddings used instead of a lookup.
                The tensor is never modified.
            past_key_values_length: offset applied to the default positions.

        Returns:
            The normalized embeddings after dropout.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length: seq_length + past_key_values_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            # Out-of-place add: an in-place "+=" here would overwrite the
            # caller's ``inputs_embeds`` and fails outright when that tensor
            # is a leaf that requires grad.
            embeddings = embeddings + self.position_embeddings(position_ids)

        embeddings = self.LayerNorm(embeddings)
        return self.dropout(embeddings)


class BertSelfAttention(nn.Module):
    """Multi-head attention used for both self-attention and cross-attention.

    When built with ``is_cross_attention=True`` the key/value projections take
    inputs of width ``config.encoder_width``; otherwise they take
    ``config.hidden_size``. Setting ``save_attention = True`` makes
    cross-attention calls record the attention map (and, when autograd is
    active, its gradient) for later retrieval.
    """

    def __init__(self, config, is_cross_attention):
        """Create the query/key/value projections and optional distance embedding.

        Raises:
            ValueError: if ``hidden_size`` is not a multiple of
                ``num_attention_heads`` and ``config`` has no
                ``embedding_size`` attribute.
        """
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        # Cross-attention keys/values are projected from the encoder's width.
        kv_width = config.encoder_width if is_cross_attention else config.hidden_size
        self.key = nn.Linear(kv_width, self.all_head_size)
        self.value = nn.Linear(kv_width, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        """Store the gradient of the attention map (used as an autograd hook)."""
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        """Return the gradient stored by ``save_attn_gradients``."""
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        """Store the attention probabilities of the latest recorded call."""
        self.attention_map = attention_map

    def get_attention_map(self):
        """Return the attention map stored by ``save_attention_map``."""
        return self.attention_map

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis to position 1."""
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Compute attention over ``hidden_states`` or ``encoder_hidden_states``.

        Args:
            hidden_states: source of the queries (and of keys/values for
                self-attention).
            attention_mask: additive mask added to the scaled scores; replaced
                by ``encoder_attention_mask`` in cross-attention.
            head_mask: optional multiplicative mask applied to the dropped-out
                attention probabilities.
            encoder_hidden_states: when given, keys/values are projected from
                it (cross-attention).
            encoder_attention_mask: additive mask for cross-attention.
            past_key_value: cached ``(key, value)`` pair that is concatenated
                in front of the newly projected keys/values (self-attention
                only).
            output_attentions: also return the attention probabilities.

        Returns:
            ``(context, [attention_probs,] (key_layer, value_layer))`` where the
            final pair holds the keys/values as projected, before any batch
            truncation.
        """
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys
        # and values come from an encoder; the attention mask needs to be
        # such that the encoder's padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        past_key_value = (key_layer, value_layer)

        # compatible with higher versions of transformers: keys/values may
        # arrive with a larger batch than the queries, so trim them to match.
        query_batch = query_layer.shape[0]
        if key_layer.shape[0] > query_batch:
            key_layer = key_layer[:query_batch]
            value_layer = value_layer[:query_batch]
            # The mask is optional; only trim it when one was supplied.
            if attention_mask is not None:
                attention_mask = attention_mask[:query_batch]

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            # NOTE(review): both position ranges are derived from the length of
            # hidden_states, so this presumably expects keys and queries of
            # equal length -- confirm before combining relative embeddings with
            # cross-attention or cached keys.
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = torch.softmax(attention_scores, dim=-1)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            # A hook can only be attached to a tensor that takes part in
            # autograd; skip it under torch.no_grad() instead of raising.
            if attention_probs.requires_grad:
                attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        # Merge the head axis back into the feature dimension.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        outputs = outputs + (past_key_value,)
        return outputs


class BertSelfOutput(nn.Module):
    """Output projection of the attention block: dense, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        """Build the projection, LayerNorm and dropout from ``config``."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertLayer(nn.Module):
    """One transformer layer: attention followed by a chunked feed-forward block.

    The ``mode`` argument of :meth:`forward` selects the attention path:

    * ``'tagging'``: only the cross-attention module runs.
    * ``'multimodal'``: self-attention runs, then cross-attention.
    * anything else: only self-attention runs.

    The cross-attention module is only created when
    ``config.add_cross_attention`` is truthy.
    """

    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.layer_num = layer_num
        self.seq_len_dim = 1
        self.chunk_size_feed_forward = config.chunk_size_feed_forward

        # Sub-modules are registered in the same order as before so that
        # parameter ordering is unchanged.
        self.attention = BertAttention(config)
        if self.config.add_cross_attention:
            self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def _cross_attend(
        self,
        queries,
        attention_mask,
        head_mask,
        encoder_hidden_states,
        encoder_attention_mask,
        output_attentions,
    ):
        """Check that encoder states were supplied, then run the cross-attention module."""
        assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
        return self.crossattention(
            queries,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            output_attentions=output_attentions,
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        mode=None,
    ):
        """Apply the layer.

        Returns a tuple ``(layer_output, *attention_extras, present_key_value)``
        where ``attention_extras`` are whatever the attention modules returned
        between their first and last element, and ``present_key_value`` is the
        last element returned by the cross-attention module in ``'tagging'``
        mode and by the self-attention module otherwise.
        """
        if mode == 'tagging':
            cross_out = self._cross_attend(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                output_attentions,
            )
            attention_output = cross_out[0]
            extras = cross_out[1:-1]  # cross attentions, if requested
            present_key_value = cross_out[-1]
        else:
            # Only the first two cached entries are handed to self-attention.
            cached_self_kv = None if past_key_value is None else past_key_value[:2]
            self_out = self.attention(
                hidden_states,
                attention_mask,
                head_mask,
                output_attentions=output_attentions,
                past_key_value=cached_self_kv,
            )
            attention_output = self_out[0]
            extras = self_out[1:-1]
            present_key_value = self_out[-1]

            if mode == 'multimodal':
                cross_out = self._cross_attend(
                    attention_output,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions,
                )
                attention_output = cross_out[0]
                extras = extras + cross_out[1:-1]  # cross attentions, if requested

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        return (layer_output,) + extras + (present_key_value,)

    def feed_forward_chunk(self, attention_output):
        """Feed-forward block: intermediate projection, then the output module
        called with the block input as its second argument."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=True,
    mode='multimodal',
):
    """Run ``hidden_states`` through every layer in ``self.layer``.

    Args:
        hidden_states: input to the first layer.
        attention_mask: forwarded unchanged to every layer.
        head_mask: optional per-layer sequence; entry ``i`` goes to layer ``i``.
        encoder_hidden_states, encoder_attention_mask: forwarded to every layer.
        past_key_values: optional per-layer sequence of cached key/values.
        use_cache: collect each layer's last output element into the cache
            tuple.  Forced to ``False`` while gradient checkpointing is active.
        output_attentions: collect ``layer_outputs[1]`` of every layer.
        output_hidden_states: collect the input of every layer plus the
            final output.
        return_dict: return a ``BaseModelOutputWithPastAndCrossAttentions``
            instead of a tuple of the non-``None`` results.
        mode: forwarded to every layer as the ``mode`` keyword.

    Returns:
        Either the model-output object or a tuple of the non-``None`` values
        among (last hidden state, cache, hidden states, self attentions,
        cross attentions).
    """
    all_hidden_states = () if output_hidden_states else None
    all_self_attentions = () if output_attentions else None
    # NOTE(review): nothing below appends to this tuple, so it is returned
    # empty when requested.  Left as-is to keep the output unchanged.
    all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

    next_decoder_cache = () if use_cache else None

    use_checkpointing = self.gradient_checkpointing and self.training
    checkpoint_kwargs = {}
    if use_checkpointing:
        # Recent PyTorch releases want ``use_reentrant`` passed explicitly;
        # older ones reject unknown keywords, so only pass it when supported.
        import inspect

        if "use_reentrant" in inspect.signature(torch.utils.checkpoint.checkpoint).parameters:
            checkpoint_kwargs["use_reentrant"] = False

    for i in range(self.config.num_hidden_layers):
        layer_module = self.layer[i]
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        layer_head_mask = head_mask[i] if head_mask is not None else None
        past_key_value = past_key_values[i] if past_key_values is not None else None

        if use_checkpointing:

            if use_cache:
                logger.warning(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

            # ``mode`` must reach the layer through the closure: the checkpoint
            # API does not reliably forward keyword arguments, and the wrapped
            # callable only receives positional inputs.  The per-layer values
            # are bound as defaults because the callable may be re-run during
            # backward, after the loop variables have moved on.
            def run_layer(*inputs, _layer=layer_module, _past=past_key_value):
                return _layer(*inputs, _past, output_attentions, mode=mode)

            layer_outputs = torch.utils.checkpoint.checkpoint(
                run_layer,
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                **checkpoint_kwargs,
            )
        else:
            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_value,
                output_attentions,
                mode=mode,
            )

        hidden_states = layer_outputs[0]
        if use_cache:
            next_decoder_cache += (layer_outputs[-1],)
        if output_attentions:
            all_self_attentions = all_self_attentions + (layer_outputs[1],)

    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(
            v
            for v in [
                hidden_states,
                next_decoder_cache,
                all_hidden_states,
                all_self_attentions,
                all_cross_attentions,
            ]
            if v is not None
        )
    return BaseModelOutputWithPastAndCrossAttentions(
        last_hidden_state=hidden_states,
        past_key_values=next_decoder_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
        cross_attentions=all_cross_attentions,
    )
class BertPredictionHeadTransform(nn.Module):
    """Dense projection, activation and layer normalisation, applied in that order.

    ``config.hidden_act`` may be a key of ``ACT2FN`` (when it is a string) or
    a callable that is used directly.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        activation = config.hidden_act
        # Strings are looked up in the activation table; anything else is
        # assumed to already be callable.
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        projected = self.dense(hidden_states)
        activated = self.transform_act_fn(projected)
        return self.LayerNorm(activated)
""" config_class = BertConfig base_model_prefix = "bert" _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() class BertModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an input to the forward pass. """ def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config self.embeddings = BertEmbeddings(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) if add_pooling_layer else None self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
    """
    Turn a 2D or 3D 0/1 attention mask into an additive mask that broadcasts over heads.

    Arguments:
        attention_mask (:obj:`torch.Tensor`):
            Ones mark positions to attend to, zeros positions to ignore. Either
            ``[batch_size, seq_length]`` or ``[batch_size, from_seq_length, to_seq_length]``.
        input_shape (:obj:`Tuple[int]`):
            ``(batch_size, seq_length)`` of the model input; only read for a 2D
            mask with ``is_decoder`` set.
        device: (:obj:`torch.device`):
            Device on which the causal mask is built.
        is_decoder (:obj:`bool`):
            For a 2D mask, combine the padding mask with a causal mask.

    Returns:
        :obj:`torch.Tensor` cast to ``self.dtype``, holding ``0.0`` where attention
        is allowed and ``-10000.0`` where it is masked.

    Raises:
        ValueError: if ``attention_mask`` is neither 2D nor 3D.
    """
    rank = attention_mask.dim()
    if rank not in (2, 3):
        raise ValueError(
            "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                input_shape, attention_mask.shape
            )
        )

    if rank == 3:
        # A full [batch, from, to] mask only needs a head axis.
        broadcast_mask = attention_mask[:, None, :, :]
    elif not is_decoder:
        # Padding mask: add head and query axes.
        broadcast_mask = attention_mask[:, None, None, :]
    else:
        batch_size, seq_length = input_shape
        positions = torch.arange(seq_length, device=device)
        # causal[b, i, j] is true when key position j is not after query position i.
        causal = positions[None, None, :].repeat(batch_size, seq_length, 1) <= positions[None, :, None]
        # causal and attention masks must have same type with pytorch version < 1.3
        causal = causal.to(attention_mask.dtype)

        # When the padding mask is longer than the input (cached prefix),
        # every query may see the whole prefix: prepend a block of ones.
        prefix_len = attention_mask.shape[1] - causal.shape[1]
        if prefix_len > 0:
            prefix = torch.ones((batch_size, seq_length, prefix_len), device=device, dtype=causal.dtype)
            causal = torch.cat([prefix, causal], dim=-1)

        broadcast_mask = causal[:, None, :, :] * attention_mask[:, None, None, :]

    # Map 1 -> 0.0 and 0 -> -10000.0 so the result can simply be added to the
    # raw scores before the softmax.
    broadcast_mask = broadcast_mask.to(dtype=self.dtype)  # fp16 compatibility
    return (1.0 - broadcast_mask) * -10000.0
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: use_cache = False if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() batch_size, seq_length = input_shape device = input_ids.device elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size, seq_length = input_shape device = inputs_embeds.device elif encoder_embeds is not None: input_shape = encoder_embeds.size()[:-1] batch_size, seq_length = input_shape device = encoder_embeds.device else: raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if 
attention_mask is None: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() else: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if encoder_embeds is None: embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) else: embedding_output = encoder_embeds encoder_outputs = self.encoder( 
embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, mode=mode, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) class BertLMHeadModel(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] def __init__(self, config): super().__init__(config) self.bert = BertModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings def forward( self, input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, return_logits=False, is_decoder=True, reduction='mean', mode='multimodal', ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. 
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). 
Returns: Example:: >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') >>> config = BertConfig.from_pretrained("bert-base-cased") >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False outputs = self.bert( input_ids, attention_mask=attention_mask, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, is_decoder=is_decoder, mode=mode, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) # sequence_output.shape torch.Size([85, 30, 768]) # prediction_scores.shape torch.Size([85, 30, 30524]) # labels.shape torch.Size([85, 30]) if return_logits: return prediction_scores[:, :-1, :].contiguous() lm_loss = None if labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() labels = labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if reduction=='none': lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((lm_loss,) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, 
def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
    """Assemble the keyword arguments for one decoding step.

    When no ``attention_mask`` is given, an all-ones mask shaped like the full
    ``input_ids`` is created.  When ``past`` is given, only the last token of
    ``input_ids`` is passed on.  Encoder states and mask are taken from
    ``model_kwargs`` (``None`` when absent), and ``is_decoder`` is always ``True``.
    """
    # Built from the full ids, before any trimming below.
    if attention_mask is None:
        attention_mask = input_ids.new_ones(input_ids.shape)

    # With cached states only the newest token has to be fed.
    step_ids = input_ids if past is None else input_ids[:, -1:]

    encoder_states = model_kwargs.get("encoder_hidden_states", None)
    encoder_mask = model_kwargs.get("encoder_attention_mask", None)
    return {
        "input_ids": step_ids,
        "attention_mask": attention_mask,
        "past_key_values": past,
        "encoder_hidden_states": encoder_states,
        "encoder_attention_mask": encoder_mask,
        "is_decoder": True,
    }
* SPDX-License-Identifier: BSD-3-Clause * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause * By Junnan Li * Based on huggingface code base * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert ''' import math import os import warnings from dataclasses import dataclass from typing import Optional, Tuple import torch from torch import Tensor, device, dtype, nn import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss import torch.nn.functional as F from transformers.activations import ACT2FN from transformers.file_utils import ( ModelOutput, ) from transformers.modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, CausalLMOutputWithCrossAttentions, MaskedLMOutput, MultipleChoiceModelOutput, NextSentencePredictorOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, ) from transformers.modeling_utils import ( PreTrainedModel, apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer, ) from transformers.utils import logging from transformers.models.bert.configuration_bert import BertConfig import loralib as lora logger = logging.get_logger(__name__) class BertEmbeddings_nopos(nn.Module): """Construct the embeddings from word and position embeddings.""" def __init__(self, config): super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) # self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported 
class BertEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings.

    Word embeddings (or caller-supplied ``inputs_embeds``) are summed with
    absolute position embeddings when ``position_embedding_type`` is
    ``"absolute"``, then passed through LayerNorm and dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        """Embed ``input_ids`` (or use ``inputs_embeds``) and add position information.

        Args:
            input_ids: token ids; used for the word-embedding lookup when
                ``inputs_embeds`` is not given.
            position_ids: optional explicit positions.  Defaults to the slice
                of the ``position_ids`` buffer starting at
                ``past_key_values_length`` and spanning the sequence length.
            inputs_embeds: optional precomputed embeddings used instead of the
                word-embedding lookup.  The tensor is never modified.
            past_key_values_length: offset applied to the default positions.

        Returns:
            The embeddings after LayerNorm and dropout.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            # Out-of-place add: ``embeddings`` aliases ``inputs_embeds`` here,
            # so ``+=`` would overwrite the caller's tensor (and fails outright
            # on a leaf tensor that requires grad).
            embeddings = embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_value=None,
    output_attentions=False,
):
    """Compute multi-head (self- or cross-) attention.

    Args:
        hidden_states: Query source; projected by ``self.query``.
        attention_mask: Additive mask added to the scaled scores. Replaced
            by ``encoder_attention_mask`` when cross-attending.
        head_mask: Optional multiplicative mask applied to the dropped-out
            attention probabilities.
        encoder_hidden_states: When given, keys and values are projected
            from this tensor instead of ``hidden_states`` (cross-attention).
        encoder_attention_mask: Additive mask used in cross-attention.
        past_key_value: Optional ``(key, value)`` pair that is concatenated
            in front of the freshly projected keys/values along dim 2
            (self-attention only).
        output_attentions: Also return the attention probabilities.

    Returns:
        ``(context, [attention_probs], (key, value))`` where the
        probabilities are present only if ``output_attentions`` is true.
        The returned ``(key, value)`` pair is captured before any batch
        trimming.
    """
    mixed_query_layer = self.query(hidden_states)

    # If this is instantiated as a cross-attention module, the keys
    # and values come from an encoder; the attention mask needs to be
    # such that the encoder's padding tokens are not attended to.
    is_cross_attention = encoder_hidden_states is not None

    if is_cross_attention:
        key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
        value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
        attention_mask = encoder_attention_mask
    elif past_key_value is not None:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
        value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
    else:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

    query_layer = self.transpose_for_scores(mixed_query_layer)

    past_key_value = (key_layer, value_layer)

    # compatible with higher versions of transformers: the key/value batch
    # may be larger than the query batch, so trim everything to the query
    # batch. The mask is optional, so only slice it when one was supplied.
    query_batch = query_layer.shape[0]
    if key_layer.shape[0] > query_batch:
        key_layer = key_layer[:query_batch]
        value_layer = value_layer[:query_batch]
        if attention_mask is not None:
            attention_mask = attention_mask[:query_batch]

    # Take the dot product between "query" and "key" to get the raw attention scores.
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

    if self.position_embedding_type in ("relative_key", "relative_key_query"):
        seq_length = hidden_states.size()[1]
        position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
        position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
        distance = position_ids_l - position_ids_r
        positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
        positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

        if self.position_embedding_type == "relative_key":
            relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
            attention_scores = attention_scores + relative_position_scores
        elif self.position_embedding_type == "relative_key_query":
            relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
            relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
            attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    if attention_mask is not None:
        # Apply the attention mask (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

    # Normalize the attention scores to probabilities.
    attention_probs = torch.softmax(attention_scores, dim=-1)

    if is_cross_attention and self.save_attention:
        self.save_attention_map(attention_probs)
        attention_probs.register_hook(self.save_attn_gradients)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs_dropped = self.dropout(attention_probs)

    # Mask heads if we want to
    if head_mask is not None:
        attention_probs_dropped = attention_probs_dropped * head_mask

    context_layer = torch.matmul(attention_probs_dropped, value_layer)

    # Merge the head dimension back: (batch, heads, seq, head) -> (batch, seq, all_head_size)
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
    context_layer = context_layer.view(*new_context_layer_shape)

    outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

    outputs = outputs + (past_key_value,)
    return outputs
class BertIntermediate(nn.Module):
    """First half of the feed-forward sublayer: linear expansion + activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # `hidden_act` is either the name of an activation in ACT2FN or a callable.
        activation = config.hidden_act
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states):
        """Project ``hidden_states`` with ``dense`` and apply the activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class BertOutput(nn.Module):
    """Second half of the feed-forward sublayer: projection, dropout, residual LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project back with ``dense``, apply dropout, add ``input_tensor`` and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class BertEncoder(nn.Module):
    """Sequential stack of ``config.num_hidden_layers`` ``BertLayer`` modules."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BertLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        mode='multimodal',
    ):
        """Run ``hidden_states`` through every layer in order.

        Args:
            hidden_states: Input to the first layer.
            attention_mask: Passed unchanged to every layer.
            head_mask: Optional per-layer head masks, indexed by layer number.
            encoder_hidden_states: Passed unchanged to every layer.
            encoder_attention_mask: Passed unchanged to every layer.
            past_key_values: Optional per-layer caches, indexed by layer number.
            use_cache: Collect each layer's last output (its key/value pair)
                into the returned cache. Forced off while gradient
                checkpointing is active during training.
            output_attentions: Collect ``layer_outputs[1]`` of every layer.
            output_hidden_states: Collect the input of every layer plus the
                final output.
            return_dict: Return a ``BaseModelOutputWithPastAndCrossAttentions``
                instead of a tuple of the non-``None`` results.
            mode: Forwarded to every layer as the ``mode`` keyword.

        Returns:
            A ``BaseModelOutputWithPastAndCrossAttentions`` or, when
            ``return_dict`` is false, a tuple of the non-``None`` values among
            (last hidden state, cache, hidden states, self-attentions,
            cross-attentions).
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        # NOTE: cross-attention maps are never appended below, so this stays
        # an empty tuple whenever it is enabled.
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                # Bind this iteration's module and cache as defaults: the
                # function is re-run during backward, after the loop
                # variables have moved on to later layers.
                def checkpointed_layer(*inputs, _module=layer_module, _past=past_key_value):
                    return _module(*inputs, _past, output_attentions, mode=mode)

                # `mode` is forwarded inside the wrapper rather than as a
                # checkpoint() keyword, which the wrapper's `*inputs`
                # signature could not accept. Non-reentrant checkpointing
                # also keeps parameter gradients when the layer input itself
                # does not require grad (e.g. frozen embeddings).
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    checkpointed_layer,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    use_reentrant=False,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    mode=mode,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class BertPredictionHeadTransform(nn.Module):
    """Dense projection, activation and LayerNorm applied before the vocabulary decoder."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # `hidden_act` is either the name of an activation in ACT2FN or a callable.
        activation = config.hidden_act
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Return ``LayerNorm(act(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class BertLMPredictionHead(nn.Module):
    """Map hidden states to vocabulary-sized scores."""

    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Apply the transform, then the decoder projection."""
        return self.decoder(self.transform(hidden_states))


class BertOnlyMLMHead(nn.Module):
    """Thin wrapper exposing the LM prediction head under the ``predictions`` attribute."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return the prediction scores for ``sequence_output``."""
        return self.predictions(sequence_output)
""" config_class = BertConfig base_model_prefix = "bert" _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """ Initialize the weights """ if isinstance(module, (nn.Linear, nn.Embedding)): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() class BertModel(BertPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of cross-attention is added between the self-attention layers, following the architecture described in `Attention is all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an input to the forward pass. """ def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config self.embeddings = BertEmbeddings(config) self.encoder = BertEncoder(config) self.pooler = BertPooler(config) if add_pooling_layer else None self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. 
def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
    """
    Turn a 0/1 attention mask into an additive mask that broadcasts over heads.

    Arguments:
        attention_mask (:obj:`torch.Tensor`):
            Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            Either 2-D (a padding mask) or 3-D (a full from/to mask).
        input_shape (:obj:`Tuple[int]`):
            The shape of the input to the model; unpacked as (batch_size, seq_length)
            on the decoder path.
        device: (:obj:`torch.device`):
            The device on which the causal mask is built.
        is_decoder (:obj:`bool`):
            Combine a 2-D padding mask with a causal mask.

    Returns:
        :obj:`torch.Tensor` A 4-D mask cast to ``self.dtype`` holding 0.0 where
        attention is allowed and -10000.0 where it is not.

    Raises:
        ValueError: if ``attention_mask`` is neither 2-D nor 3-D.
    """
    rank = attention_mask.dim()
    if rank not in (2, 3):
        raise ValueError(
            "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                input_shape, attention_mask.shape
            )
        )

    if rank == 3:
        # A full [batch_size, from_seq_length, to_seq_length] mask only needs a head axis.
        broadcast_mask = attention_mask[:, None, :, :]
    elif not is_decoder:
        # Encoder: a padding mask broadcast over heads and query positions.
        broadcast_mask = attention_mask[:, None, None, :]
    else:
        # Decoder: padding mask combined with a lower-triangular causal mask.
        batch_size, seq_length = input_shape
        positions = torch.arange(seq_length, device=device)
        causal = positions[None, None, :].repeat(batch_size, seq_length, 1) <= positions[None, :, None]
        # causal and attention masks must have same type with pytorch version < 1.3
        causal = causal.to(attention_mask.dtype)

        # When past key/values are in use the padding mask is longer than the
        # current input; every cached position is visible, so pad with ones.
        prefix_len = attention_mask.shape[1] - causal.shape[1]
        if prefix_len > 0:
            visible_prefix = torch.ones(
                (batch_size, seq_length, prefix_len), device=device, dtype=causal.dtype
            )
            causal = torch.cat([visible_prefix, causal], dim=-1)

        broadcast_mask = causal[:, None, :, :] * attention_mask[:, None, None, :]

    # Convert 1 -> 0.0 (keep) and 0 -> -10000.0 (mask); adding this to the raw
    # scores before the softmax effectively removes the masked positions.
    broadcast_mask = broadcast_mask.to(dtype=self.dtype)  # fp16 compatibility
    return (1.0 - broadcast_mask) * -10000.0
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: use_cache = False if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.size() batch_size, seq_length = input_shape device = input_ids.device elif inputs_embeds is not None: input_shape = inputs_embeds.size()[:-1] batch_size, seq_length = input_shape device = inputs_embeds.device elif encoder_embeds is not None: input_shape = encoder_embeds.size()[:-1] batch_size, seq_length = input_shape device = encoder_embeds.device else: raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") # past_key_values_length past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if 
attention_mask is None: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() else: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) if encoder_embeds is None: embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) else: embedding_output = encoder_embeds encoder_outputs = self.encoder( 
embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, mode=mode, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, ) class BertLMHeadModel(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] def __init__(self, config): super().__init__(config) self.bert = BertModel(config, add_pooling_layer=False) self.cls = BertOnlyMLMHead(config) self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings def forward( self, input_ids=None, attention_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, labels=None, past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, return_logits=False, is_decoder=True, reduction='mean', mode='multimodal', ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. 
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. use_cache (:obj:`bool`, `optional`): If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). 
Returns: Example:: >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') >>> config = BertConfig.from_pretrained("bert-base-cased") >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False outputs = self.bert( input_ids, attention_mask=attention_mask, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, is_decoder=is_decoder, mode=mode, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) # sequence_output.shape torch.Size([85, 30, 768]) # prediction_scores.shape torch.Size([85, 30, 30524]) # labels.shape torch.Size([85, 30]) if return_logits: return prediction_scores[:, :-1, :].contiguous() lm_loss = None if labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() labels = labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if reduction=='none': lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((lm_loss,) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, 
def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
    """Assemble the keyword arguments for one decoding step.

    When no ``attention_mask`` is given, an all-ones mask with the shape of
    the full ``input_ids`` is created. When ``past`` is given, only the last
    token of ``input_ids`` is fed to the model.
    """
    # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
    mask = input_ids.new_ones(input_ids.shape) if attention_mask is None else attention_mask

    # cut decoder_input_ids if past is used
    step_ids = input_ids if past is None else input_ids[:, -1:]

    return {
        "input_ids": step_ids,
        "attention_mask": mask,
        "past_key_values": past,
        "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
        "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
        "is_decoder": True,
    }

def _reorder_cache(self, past, beam_idx):
    """Re-index every cached tensor along dim 0 with ``beam_idx``; returns a tuple of tuples."""
    return tuple(
        tuple(state.index_select(0, beam_idx) for state in layer_cache)
        for layer_cache in past
    )
Described in the paper " Recognize Anything: A Strong Image Tagging Model" https://recognize-anything.github.io/ Args: med_config (str): path for the mixture of encoder-decoder model's configuration file image_size (int): input image size vit (str): model size of vision transformer threshold (int): tagging threshold delete_tag_index (list): delete some tags that may disturb captioning """ super().__init__() # create image encoder if vit == 'swin_b': if image_size == 224: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json' elif image_size == 384: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json' vision_config = read_json(vision_config_path) assert image_size == vision_config['image_res'] # assert config['patch_size'] == 32 vision_width = vision_config['vision_width'] self.visual_encoder = SwinTransformer( img_size=vision_config['image_res'], patch_size=4, in_chans=3, embed_dim=vision_config['embed_dim'], depths=vision_config['depths'], num_heads=vision_config['num_heads'], window_size=vision_config['window_size'], mlp_ratio=4., qkv_bias=True, drop_rate=0.0, drop_path_rate=0.1, ape=False, patch_norm=True, use_checkpoint=False) elif vit == 'swin_l': if image_size == 224: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_224.json' elif image_size == 384: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_384.json' elif image_size == 444: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_444.json' vision_config = read_json(vision_config_path) assert image_size == vision_config['image_res'] # assert config['patch_size'] == 32 vision_width = vision_config['vision_width'] self.visual_encoder = SwinTransformer( img_size=vision_config['image_res'], patch_size=4, in_chans=3, embed_dim=vision_config['embed_dim'], depths=vision_config['depths'], num_heads=vision_config['num_heads'], window_size=vision_config['window_size'], mlp_ratio=4., qkv_bias=True, drop_rate=0.0, drop_path_rate=0.1, ape=False, 
patch_norm=True, use_checkpoint=False) else: self.visual_encoder, vision_width = create_vit( vit, image_size, vit_grad_ckpt, vit_ckpt_layer) # create tokenzier self.tokenizer = init_tokenizer() # Tag2Text employ encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder # create image-tag interaction encoder encoder_config = BertConfig.from_json_file(med_config) encoder_config.encoder_width = 512 self.tag_encoder = BertModel(config=encoder_config, add_pooling_layer=False) # create image-tag-text decoder decoder_config = BertConfig.from_json_file(med_config) self.text_decoder = BertLMHeadModel(config=decoder_config) self.delete_tag_index = delete_tag_index self.prompt = prompt self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1 # load tag list self.tag_list = self.load_tag_list(tag_list) self.tag_list_chinese = self.load_tag_list(tag_list_chinese) # create image-tag recognition decoder self.threshold = threshold self.num_class = len(self.tag_list) q2l_config = BertConfig.from_json_file(f'{CONFIG_PATH}/configs/q2l_config.json') q2l_config.encoder_width = 512 self.tagging_head = BertModel(config=q2l_config, add_pooling_layer=False) self.tagging_head.resize_token_embeddings(len(self.tokenizer)) # self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size) self.label_embed = nn.Parameter(torch.zeros(self.num_class, q2l_config.encoder_width)) if q2l_config.hidden_size != 512: self.wordvec_proj = nn.Linear(512, q2l_config.hidden_size) else: self.wordvec_proj = nn.Identity() self.fc = nn.Linear(q2l_config.hidden_size, 1) self.del_selfattention() # share weights of the lowest 2-layer of "image-tag interaction encoder" with the "image-tag recogntion decoder" tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '', ' ') self.image_proj = nn.Linear(vision_width, 512) # self.label_embed = 
def generate_tag(self,
                 image,
                 threshold=0.68,
                 tag_input=None,
                 ):
    """Tag a batch of images with the closed-set RAM vocabulary.

    Args:
        image: batch of input images fed to ``self.visual_encoder``.
        threshold: accepted for interface compatibility; the per-class
            values in ``self.class_threshold`` are what is actually applied.
        tag_input: unused.

    Returns:
        ``(tag_output, tag_output_chinese)``: two lists with one
        ``', '``-joined string per image.
    """
    device = image.device

    # Label queries and projected image features.
    queries = torch.nn.functional.relu(self.wordvec_proj(self.label_embed))
    image_embeds = self.image_proj(self.visual_encoder(image))
    image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)

    # recognize image tags using the image-tag recognition decoder
    bs = image_embeds.shape[0]
    decoded = self.tagging_head(
        encoder_embeds=queries.unsqueeze(0).repeat(bs, 1, 1),
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
        return_dict=False,
        mode='tagging',
    )
    logits = self.fc(decoded[0]).squeeze(-1)

    # 1.0 where the class probability clears its own threshold, else 0.0.
    hits = torch.sigmoid(logits) > self.class_threshold.to(device)
    tag = hits.to(torch.float32).cpu().numpy()
    tag[:, self.delete_tag_index] = 0

    tag_output, tag_output_chinese = [], []
    for row in tag:
        picked = np.flatnonzero(row == 1)
        tag_output.append(', '.join(self.tag_list[picked]))
        tag_output_chinese.append(', '.join(self.tag_list_chinese[picked]))

    return tag_output, tag_output_chinese
# load RAM pretrained model parameters
def ram(pretrained='', **kwargs):
    """Build a ``RAM`` model and optionally load pretrained weights.

    Args:
        pretrained: checkpoint location; nothing is loaded when it is empty.
        **kwargs: forwarded unchanged to ``RAM``. ``vit`` selects which
            checkpoint loader is used.

    Returns:
        The constructed (and possibly weight-loaded) model.
    """
    model = RAM(**kwargs)
    if pretrained:
        # ``vit`` is optional for the constructor (it defaults to 'base'), so it
        # must not be indexed unconditionally here: ``kwargs['vit']`` raised
        # KeyError whenever the caller relied on that default.
        vit = kwargs.get('vit', 'base')
        if vit == 'swin_b':
            model, _ = load_checkpoint_swinbase(model, pretrained, kwargs)
        elif vit == 'swin_l':
            model, _ = load_checkpoint_swinlarge(model, pretrained, kwargs)
        else:
            model, _ = load_checkpoint(model, pretrained)
        print('vit:', vit)
    return model
Described in the paper " Recognize Anything: A Strong Image Tagging Model" https://recognize-anything.github.io/ Args: med_config (str): path for the mixture of encoder-decoder model's configuration file image_size (int): input image size vit (str): model size of vision transformer threshold (int): tagging threshold delete_tag_index (list): delete some tags that may disturb captioning """ super().__init__() # create image encoder if vit == 'swin_b': if image_size == 224: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json' elif image_size == 384: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json' vision_config = read_json(vision_config_path) assert image_size == vision_config['image_res'] # assert config['patch_size'] == 32 vision_width = vision_config['vision_width'] self.visual_encoder = SwinTransformer( img_size=vision_config['image_res'], patch_size=4, in_chans=3, embed_dim=vision_config['embed_dim'], depths=vision_config['depths'], num_heads=vision_config['num_heads'], window_size=vision_config['window_size'], mlp_ratio=4., qkv_bias=True, drop_rate=0.0, drop_path_rate=0.1, ape=False, patch_norm=True, use_checkpoint=False) elif vit == 'swin_l': if image_size == 224: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_224.json' elif image_size == 384: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_384.json' elif image_size == 444: vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_444.json' vision_config = read_json(vision_config_path) assert image_size == vision_config['image_res'] # assert config['patch_size'] == 32 vision_width = vision_config['vision_width'] self.visual_encoder = SwinTransformer( img_size=vision_config['image_res'], patch_size=4, in_chans=3, embed_dim=vision_config['embed_dim'], depths=vision_config['depths'], num_heads=vision_config['num_heads'], window_size=vision_config['window_size'], mlp_ratio=4., qkv_bias=True, drop_rate=0.0, drop_path_rate=0.1, ape=False, 
patch_norm=True, use_checkpoint=False) else: self.visual_encoder, vision_width = create_vit( vit, image_size, vit_grad_ckpt, vit_ckpt_layer) # create tokenzier self.tokenizer = init_tokenizer() # Tag2Text employ encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder # create image-tag interaction encoder encoder_config = BertConfig.from_json_file(med_config) encoder_config.encoder_width = 512 self.tag_encoder = BertModel(config=encoder_config, add_pooling_layer=False) # create image-tag-text decoder decoder_config = BertConfig.from_json_file(med_config) self.text_decoder = BertLMHeadModel(config=decoder_config) self.delete_tag_index = delete_tag_index self.prompt = prompt self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1 # load tag list self.tag_list = self.load_tag_list(tag_list) self.tag_list_chinese = self.load_tag_list(tag_list_chinese) # create image-tag recognition decoder self.threshold = threshold self.num_class = len(self.tag_list) q2l_config = BertConfig.from_json_file(f'{CONFIG_PATH}/configs/q2l_config.json') q2l_config.encoder_width = 512 self.tagging_head = BertModel(config=q2l_config, add_pooling_layer=False) self.tagging_head.resize_token_embeddings(len(self.tokenizer)) # self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size) self.label_embed = nn.Parameter(torch.zeros(self.num_class, q2l_config.encoder_width)) if q2l_config.hidden_size != 512: self.wordvec_proj = nn.Linear(512, q2l_config.hidden_size) else: self.wordvec_proj = nn.Identity() self.fc = nn.Linear(q2l_config.hidden_size, 1) self.del_selfattention() # share weights of the lowest 2-layer of "image-tag interaction encoder" with the "image-tag recogntion decoder" tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '', ' ') self.image_proj = nn.Linear(vision_width, 512) # self.label_embed = 
def load_tag_list(self, tag_list_file):
    """Read a UTF-8 text file with one tag per line into a NumPy array.

    Args:
        tag_list_file: path of the tag list file.

    Returns:
        ``np.ndarray`` holding the lines of the file, in file order.
    """
    with open(tag_list_file, 'r', encoding="utf-8") as handle:
        lines = handle.read().splitlines()
    return np.array(lines)
def condition_forward(self,
                      image,
                      threshold=0.68,
                      condition_flag=None,
                      tag_input=None,
                      only_feature=True
                      ):
    """Encode an image batch and, optionally, run the tagging head.

    Args:
        image: batch of input images fed to ``self.visual_encoder``.
        threshold: unused; ``self.class_threshold`` holds the values applied.
        condition_flag: unused.
        tag_input: unused.
        only_feature: when True (default) only the projected image
            embeddings are returned.

    Returns:
        ``image_embeds`` when ``only_feature`` is True, otherwise the tuple
        ``(image_embeds, logits, targets)`` where ``targets`` is 1.0 for
        every class whose sigmoid score exceeds its threshold and 0.0
        elsewhere.
    """
    image_embeds = self.image_proj(self.visual_encoder(image))
    if only_feature:
        # Feature-only callers never touch the label queries, so the
        # projection of ``self.label_embed`` is deliberately not computed here.
        return image_embeds

    label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed))
    image_atts = torch.ones(image_embeds.size()[:-1],
                            dtype=torch.long).to(image.device)

    # recognize image tags using the image-tag recognition decoder
    bs = image_embeds.shape[0]
    label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1)
    tagging_embed = self.tagging_head(
        encoder_embeds=label_embed,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
        return_dict=False,
        mode='tagging',
    )

    logits = self.fc(tagging_embed[0]).squeeze(-1)

    targets = torch.where(
        torch.sigmoid(logits) > self.class_threshold.to(image.device),
        torch.tensor(1.0).to(image.device),
        torch.zeros(self.num_class).to(image.device))

    return image_embeds, logits, targets
# load RAM pretrained model parameters
def ram(pretrained='', pretrained_condition='', **kwargs):
    """Build a ``RAMLora`` model and optionally load base and LoRA weights.

    Args:
        pretrained: base checkpoint location; skipped when empty.
        pretrained_condition: path of an additional state dict that is
            loaded non-strictly on top of the model; skipped when empty.
        **kwargs: forwarded unchanged to ``RAMLora``. ``vit`` selects which
            checkpoint loader is used.

    Returns:
        The constructed (and possibly weight-loaded) model.
    """
    model = RAMLora(**kwargs)

    if pretrained:
        # ``vit`` defaults to 'base' in the constructor; indexing
        # ``kwargs['vit']`` raised KeyError for callers relying on that default.
        vit = kwargs.get('vit', 'base')
        if vit == 'swin_b':
            model, _ = load_checkpoint_swinbase(model, pretrained, kwargs)
        elif vit == 'swin_l':
            model, _ = load_checkpoint_swinlarge(model, pretrained, kwargs)
        else:
            model, _ = load_checkpoint(model, pretrained)
        print('vit:', vit)

    if pretrained_condition:
        # Deserialize onto the CPU so a checkpoint saved from a GPU process
        # also loads on hosts without that device; load_state_dict copies
        # the values into the model's own parameters afterwards.
        state = torch.load(pretrained_condition, map_location='cpu')
        model.load_state_dict(state, strict=False)
        print(f'load lora from {pretrained_condition}')

    return model
def window_reverse(windows, window_size, H, W):
    """Merge non-overlapping windows back into full feature maps.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    rows, cols = H // window_size, W // window_size
    # Exact integer arithmetic: the batch size used to be recovered through
    # float division followed by int() truncation.
    B = windows.shape[0] // (rows * cols)
    x = windows.view(B, rows, cols, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x
def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
    """Create the projections and the relative-position bias lookup.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        attn_drop (float, optional): Dropout ratio of attention weight.
        proj_drop (float, optional): Dropout ratio of output.
    """
    super().__init__()
    self.dim = dim
    self.window_size = window_size  # Wh, Ww
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim ** -0.5

    # define a parameter table of relative position bias
    self.relative_position_bias_table = nn.Parameter(
        torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

    # get pair-wise relative position index for each token inside the window
    coords_h = torch.arange(self.window_size[0])
    coords_w = torch.arange(self.window_size[1])
    # 'ij' is the layout the legacy call produced; naming it explicitly avoids
    # the deprecation warning for calling meshgrid without ``indexing``.
    coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing='ij'))  # 2, Wh, Ww
    coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += self.window_size[1] - 1
    # Row offsets are scaled so that (row, col) pairs map to unique flat indices.
    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    self.register_buffer("relative_position_index", relative_position_index)

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)

    trunc_normal_(self.relative_position_bias_table, std=.02)
    self.softmax = nn.Softmax(dim=-1)
class SwinTransformerBlock(nn.Module):
    r""" Swin Transformer Block with optional scale/shift conditioning.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            H, W = self.input_resolution
            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
            h_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            w_slices = (slice(0, -self.window_size),
                        slice(-self.window_size, -self.shift_size),
                        slice(-self.shift_size, None))
            # Give every (h, w) region its own id; tokens with different ids
            # must not attend to each other after the cyclic shift.
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None

        self.register_buffer("attn_mask", attn_mask)

        ## condition from LR
        # Each projection maps a 256-wide condition vector to 2*dim values that
        # forward() splits into a multiplicative and an additive half.
        self.condition_attention = nn.Sequential(
            nn.Linear(256, dim*2, bias=False),
        )
        self.condition_ffn = nn.Sequential(
            nn.Linear(256, dim*2, bias=False),
        )
        # NOTE(review): zero_module is defined outside this block; presumably it
        # zero-initialises the given module's parameters - confirm.
        zero_module(self.condition_attention)
        zero_module(self.condition_ffn)

    def forward(self, x, condition=None):
        """Run the block on ``x`` of shape (B, H*W, C).

        Args:
            x: input tokens; ``H*W`` must match ``self.input_resolution``.
            condition: optional conditioning tensor whose last dimension is
                256 (the input width of the conditioning layers). When given,
                the normalised features are modulated as ``x * mul + add``
                before attention and before the MLP.

        Returns:
            Tensor with the same shape as ``x``.
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # add condition before attention
        # input B,H,W,C
        if condition is not None:
            x = x.permute(0, 3, 1, 2)  # BCHW
            condition_attention = self.condition_attention(condition).view(-1, 2*C, 1, 1)
            condition_attn_multiplication, condition_attn_addition = condition_attention.chunk(2, dim=1)
            # The shift term is the *addition* half, mirroring the FFN branch
            # below; previously the multiplication half was added a second
            # time and the addition half was never used.
            x = x*condition_attn_multiplication + condition_attn_addition
            x = x.permute(0, 2, 3, 1)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=self.attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x
        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)

        # add condition before ffn
        # input B,H*W,C
        if condition is not None:
            res = x
            x = self.norm2(x)
            x = x.view(B, H, W, C)
            x = x.permute(0, 3, 1, 2)  # BCHW
            condition_ffn = self.condition_ffn(condition).view(-1, 2*C, 1, 1)
            condition_ffn_multiplication, condition_ffn_addition = condition_ffn.chunk(2, dim=1)
            x = x*condition_ffn_multiplication + condition_ffn_addition
            x = x.permute(0, 2, 3, 1)
            x = x.view(B, H*W, C)
            x = res + self.drop_path(self.mlp(x))
        else:
            x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"

    def flops(self):
        """Return the FLOP estimate for the norms, window attention and MLP.

        The conditioning projections are not included in the count.
        """
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # W-MSA/SW-MSA
        nW = H * W / self.window_size / self.window_size
        flops += nW * self.attn.flops(self.window_size * self.window_size)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self, dim, input_resolution, depth, num_heads, window_size,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):

        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks; odd-indexed blocks use shifted windows
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
                                 num_heads=num_heads, window_size=window_size,
                                 shift_size=0 if (i % 2 == 0) else window_size // 2,
                                 mlp_ratio=mlp_ratio,
                                 qkv_bias=qkv_bias, qk_scale=qk_scale,
                                 drop=drop, attn_drop=attn_drop,
                                 drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                                 norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, condition=None):
        """Apply every block (and the optional downsample) to ``x``.

        Args:
            x: input tokens for this stage.
            condition: optional conditioning tensor handed to every block.

        Returns:
            The stage output.
        """
        for blk in self.blocks:
            if self.use_checkpoint:
                # Forward ``condition`` positionally so the checkpointed path
                # matches the plain path; it used to be dropped here.
                x = checkpoint.checkpoint(blk, x, condition)
            else:
                x = blk(x, condition=condition)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        """Sum the FLOP estimates of all blocks and of the downsample layer."""
        flops = 0
        for blk in self.blocks:
            flops += blk.flops()
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops
class SwinTransformer(nn.Module):
    r""" Swin Transformer
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    This variant has no classification head (it is commented out in
    ``__init__``). ``forward`` returns the normalised token sequence with an
    average-pooled token prepended, and accepts an optional ``condition`` that
    is handed to every stage.

    Args:
        img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        num_classes (int): Number of classes for classification head. Default: 1000
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.1
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
        **kwargs: Accepted and ignored.
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
                 embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24],
                 window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1,
                 norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
                 use_checkpoint=False, **kwargs):
        super().__init__()

        self.num_classes = num_classes
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        # Channel width doubles at every stage after the first.
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # absolute position embedding (only created when ``ape`` is set)
        if self.ape:
            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth: one rate per block, rising linearly from 0 to drop_path_rate
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers: stage i works at 1 / 2**i of the patch resolution with
        # embed_dim * 2**i channels; every stage but the last ends in PatchMerging
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
                               input_resolution=(patches_resolution[0] // (2 ** i_layer),
                                                 patches_resolution[1] // (2 ** i_layer)),
                               depth=depths[i_layer],
                               num_heads=num_heads[i_layer],
                               window_size=window_size,
                               mlp_ratio=self.mlp_ratio,
                               qkv_bias=qkv_bias, qk_scale=qk_scale,
                               drop=drop_rate, attn_drop=attn_drop_rate,
                               drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                               norm_layer=norm_layer,
                               downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                               use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        self.norm = norm_layer(self.num_features)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        # The classification head is intentionally disabled; ``num_classes`` is
        # only stored (and still used by ``flops``).
        # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()

        # Re-initialise every nn.Linear / nn.LayerNorm submodule built above.
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one submodule; called for each submodule through ``self.apply``.

        nn.Linear weights get a truncated normal (std 0.02) and zero bias;
        nn.LayerNorm gets zero bias and unit weight. Other modules are untouched.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return the parameter name ``absolute_pos_embed``.

        Presumably read by optimizer-construction code to exempt it from
        weight decay -- the consumer is not visible in this file.
        """
        return {'absolute_pos_embed'}

    @torch.jit.ignore
    def no_weight_decay_keywords(self):
        """Return the keyword ``relative_position_bias_table`` (same presumed purpose as ``no_weight_decay``)."""
        return {'relative_position_bias_table'}

    def forward(self, x, idx_to_group_img=None, image_atts=None, condition=None, **kwargs):
        """Encode images into a token sequence with a pooled token in front.

        Args:
            x: Image batch handed to ``self.patch_embed``.
            idx_to_group_img: Optional index tensor. When given, the encoded
                batch is re-gathered along dim 0 with it.
            image_atts: Weights used for the weighted pooling of the gathered
                batch; its first column is skipped. Required whenever
                ``idx_to_group_img`` is given.
            condition: Optional value passed unchanged to every stage as
                ``layer(x, condition=condition)``.
            **kwargs: Accepted and ignored.

        Returns:
            If ``idx_to_group_img`` is None: a tensor of shape (B, 1 + L, C)
            whose first token is the average over the L encoded tokens.
            Otherwise a tuple ``(gathered, plain)``: ``plain`` is the tensor
            described above and ``gathered`` is the same layout built from the
            re-gathered batch, its first token being the ``image_atts``-weighted
            average.
        """
        x = self.patch_embed(x)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        for layer in self.layers:
            x = layer(x, condition=condition)

        x = self.norm(x)  # B L C

        # Average over the token axis to obtain one pooled token per image.
        x_cls = self.avgpool(x.transpose(1, 2))  # B C 1

        if idx_to_group_img is None:
            return torch.cat([x_cls.transpose(1, 2), x], dim=1)
        else:
            # Pick, for every entry of idx_to_group_img, the matching row of x.
            x_bs = torch.gather(x, dim=0, index=idx_to_group_img.view(-1, 1, 1).expand(-1, x.shape[1], x.shape[2]))
            weights = image_atts[:, 1:].unsqueeze(2)  # B L 1
            # Weighted mean over tokens: weighted sum divided by the weight total.
            x_bs_cls = torch.sum((weights * x_bs).transpose(1, 2), dim=-1, keepdim=True)  # B C 1
            x_bs_cls = x_bs_cls / torch.sum(weights.transpose(1, 2), dim=-1, keepdim=True)  # avgpool

            return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), \
                   torch.cat([x_cls.transpose(1, 2), x], dim=1)

    def flops(self):
        """Estimate the FLOPs of the model from its submodules' ``flops`` methods."""
        flops = 0
        flops += self.patch_embed.flops()
        for i, layer in enumerate(self.layers):
            flops += layer.flops()
        # Final norm term.
        flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers)
        # NOTE(review): this is the cost of the classification head, which is
        # commented out in __init__ -- confirm whether it should still be counted.
        flops += self.num_features * self.num_classes
        return flops
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): side length of each window

    Returns:
        windows: (num_windows*B, window_size, window_size, C), ordered by
        batch element, then window row, then window column.
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size

    # Split both spatial axes into (window index, offset inside the window).
    tiled = x.view(batch, rows, window_size, cols, window_size, channels)

    # Put the two window-index axes next to each other so that each window
    # becomes one contiguous (window_size, window_size, C) slab.
    tiled = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiled.view(-1, window_size, window_size, channels)
class WindowAttention(nn.Module):
    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    The fused query/key/value projection is a ``lora.MergedLinear`` of rank 8
    whose low-rank update is enabled for the query and value slices only.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # Fold the (row, column) offset pair into a single flat table index.
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        # LoRA version of nn.Linear(dim, dim * 3, bias=qkv_bias). ``bias`` is
        # forwarded so that ``qkv_bias`` is honoured instead of ignored.
        self.qkv = lora.MergedLinear(dim, 3 * dim, r=8, enable_lora=[True, False, True], bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)

        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None

        Returns:
            Attended features with the same shape as ``x``.
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        # Look up the learned bias for every (query, key) position pair.
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # The mask is per window: expose the window axis, add it broadcast
            # over batch and heads, then flatten back.
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'

    def flops(self, N):
        """Estimate the FLOPs for one window holding ``N`` tokens."""
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        #  x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops
else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) if self.shift_size > 0: # calculate attention mask for SW-MSA H, W = self.input_resolution img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) else: attn_mask = None self.register_buffer("attn_mask", attn_mask) def forward(self, x): H, W = self.input_resolution B, L, C = x.shape assert L == H * W, "input feature has wrong size" shortcut = x x = self.norm1(x) x = x.view(B, H, W, C) # cyclic shift if self.shift_size > 0: shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) else: shifted_x = x # partition windows x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C # reverse cyclic shift if self.shift_size > 0: x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = 
class PatchMerging(nn.Module):
    r""" Patch Merging Layer.

    Halves the spatial resolution by concatenating the four tokens of every
    2x2 cell along the channel axis (C -> 4*C), normalising, and projecting
    linearly to 2*C channels.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x):
        """
        x: B, H*W, C

        Returns a tensor of shape B, H/2*W/2, 2*C.
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        grid = x.view(B, H, W, C)

        # (row offset, column offset) of the four tokens in each 2x2 cell,
        # listed in the order they are stacked along the channel axis.
        cell_offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
        pieces = [grid[:, row::2, col::2, :] for row, col in cell_offsets]  # each B H/2 W/2 C

        merged = torch.cat(pieces, -1)  # B H/2 W/2 4*C
        merged = merged.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        """Estimate FLOPs: the normalisation term plus the 4*dim -> 2*dim projection."""
        H, W = self.input_resolution
        norm_cost = H * W * self.dim
        reduction_cost = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
        return norm_cost + reduction_cost
class PatchEmbed(nn.Module):
    r""" Image to Patch Embedding

    Projects an image to a sequence of patch tokens with a convolution whose
    kernel size and stride both equal the patch size, then optionally
    normalises the tokens.

    Args:
        img_size (int): Image size.  Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        # Number of whole patches along each image side.
        self.patches_resolution = [self.img_size[0] // self.patch_size[0],
                                   self.img_size[1] // self.patch_size[1]]
        self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed a (B, C, H, W) image batch as (B, Ph*Pw, embed_dim) tokens.

        The input size must equal ``self.img_size``.
        """
        B, C, H, W = x.shape
        # FIXME look at relaxing size constraints
        assert H == self.img_size[0] and W == self.img_size[1], \
            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."

        feature_map = self.proj(x)
        tokens = feature_map.flatten(2).transpose(1, 2)  # B Ph*Pw C
        if self.norm is None:
            return tokens
        return self.norm(tokens)

    def flops(self):
        """Estimate FLOPs of the projection, plus the normalisation when present."""
        Ho, Wo = self.patches_resolution
        kernel_area = self.patch_size[0] * self.patch_size[1]
        total = Ho * Wo * self.embed_dim * self.in_chans * kernel_area
        if self.norm is not None:
            total += Ho * Wo * self.embed_dim
        return total
Default: 4 qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None drop_rate (float): Dropout rate. Default: 0 attn_drop_rate (float): Attention dropout rate. Default: 0 drop_path_rate (float): Stochastic depth rate. Default: 0.1 norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. ape (bool): If True, add absolute position embedding to the patch embedding. Default: False patch_norm (bool): If True, add normalization after patch embedding. Default: True use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False """ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, norm_layer=nn.LayerNorm, ape=False, patch_norm=True, use_checkpoint=False, **kwargs): super().__init__() self.num_classes = num_classes self.num_layers = len(depths) self.embed_dim = embed_dim self.ape = ape self.patch_norm = patch_norm self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) self.mlp_ratio = mlp_ratio # split image into non-overlapping patches self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer if self.patch_norm else None) num_patches = self.patch_embed.num_patches patches_resolution = self.patch_embed.patches_resolution self.patches_resolution = patches_resolution # absolute position embedding if self.ape: self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) trunc_normal_(self.absolute_pos_embed, std=.02) self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() for i_layer in 
range(self.num_layers): layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), input_resolution=(patches_resolution[0] // (2 ** i_layer), patches_resolution[1] // (2 ** i_layer)), depth=depths[i_layer], num_heads=num_heads[i_layer], window_size=window_size, mlp_ratio=self.mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], norm_layer=norm_layer, downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, use_checkpoint=use_checkpoint) self.layers.append(layer) self.norm = norm_layer(self.num_features) self.avgpool = nn.AdaptiveAvgPool1d(1) # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() self.apply(self._init_weights) def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0) @torch.jit.ignore def no_weight_decay(self): return {'absolute_pos_embed'} @torch.jit.ignore def no_weight_decay_keywords(self): return {'relative_position_bias_table'} def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs): x = self.patch_embed(x) if self.ape: x = x + self.absolute_pos_embed x = self.pos_drop(x) for layer in self.layers: x = layer(x) x = self.norm(x) # B L C x_cls = self.avgpool(x.transpose(1, 2)) # B C 1 if idx_to_group_img is None: return torch.cat([x_cls.transpose(1, 2), x], dim=1) else: x_bs = torch.gather(x, dim=0, index=idx_to_group_img.view(-1, 1, 1).expand(-1, x.shape[1], x.shape[2])) weights = image_atts[:, 1:].unsqueeze(2) # B L 1 x_bs_cls = torch.sum((weights * x_bs).transpose(1, 2), dim=-1, keepdim=True) # B C 1 x_bs_cls = x_bs_cls / torch.sum(weights.transpose(1, 2), dim=-1, keepdim=True) # avgpool return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), \ torch.cat([x_cls.transpose(1, 
def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=''):
    """Resize a relative position bias table to a different window size.

    Adapted from
    https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348

    The source table is treated as a square grid whose sample coordinates grow
    geometrically away from the centre; each head is resampled onto an evenly
    spaced target grid with a bicubic spline.

    Args:
        rel_pos_bias: relative_position_bias_table of shape
            (src_num_pos, num_attn_heads). It may live on any device and may
            require grad.
        dst_num_pos (int): Number of positions wanted in the resized table.
        param_name (str): Name used only in the progress message.

    Returns:
        ``rel_pos_bias`` itself when source and target grids have the same
        side length. Otherwise a new float32 tensor of shape
        (dst_size * dst_size, num_attn_heads) on the device of ``rel_pos_bias``.
    """
    src_num_pos, num_attn_heads = rel_pos_bias.size()

    num_extra_tokens = 0
    src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
    dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
    if src_size == dst_size:
        return rel_pos_bias

    print("Position interpolate %s from %dx%d to %dx%d" % (param_name, src_size, src_size, dst_size, dst_size))

    def geometric_progression(a, r, n):
        return a * (1.0 - r ** n) / (1.0 - r)

    # Bisect for the ratio q at which src_size // 2 geometrically growing
    # steps span dst_size // 2 target positions.
    left, right = 1.01, 1.5
    while right - left > 1e-6:
        q = (left + right) / 2.0
        gp = geometric_progression(1, q, src_size // 2)
        if gp > dst_size // 2:
            right = q
        else:
            left = q

    # Distances of the source samples from the centre (positive side).
    dis = []
    cur = 1
    for i in range(src_size // 2):
        dis.append(cur)
        cur += q ** (i + 1)

    r_ids = [-_ for _ in reversed(dis)]

    x = r_ids + [0] + dis
    y = r_ids + [0] + dis

    t = dst_size // 2.0
    dx = np.arange(-t, t + 0.1, 1.0)
    dy = np.arange(-t, t + 0.1, 1.0)

    # NumPy conversion needs a CPU tensor that is detached from autograd.
    source = rel_pos_bias.detach().cpu().float()

    all_rel_pos_bias = []
    for i in range(num_attn_heads):
        z = source[:, i].view(src_size, src_size).numpy()
        # interp2d(x, y, z, kind='cubic') was removed in SciPy 1.14. On a
        # rectilinear grid the documented equivalent is a bicubic
        # RectBivariateSpline; with rows indexed by y and columns by x, no
        # transpose is needed on input or output.
        spline = interpolate.RectBivariateSpline(y, x, z, kx=3, ky=3)
        all_rel_pos_bias.append(
            torch.Tensor(spline(dy, dx)).contiguous().view(-1, 1).to(rel_pos_bias.device))

    return torch.cat(all_rel_pos_bias, dim=-1)
class Tag2Text(nn.Module):
    def __init__(self,
                 med_config=f'{CONFIG_PATH}/configs/med_config.json',
                 image_size=384,
                 vit='base',
                 vit_grad_ckpt=False,
                 vit_ckpt_layer=0,
                 prompt='a picture of ',
                 threshold=0.68,
                 delete_tag_index=(127, 2961, 3351, 3265, 3338, 3355, 3359),
                 tag_list=f'{CONFIG_PATH}/data/tag_list.txt'):
        r""" Tag2Text inference module, both captioning and tagging are included.
        Tag2Text is an efficient and controllable vision-language pre-training framework.
        Described in the paper "Tag2Text: Guiding Vision-Language Model via Image Tagging" https://arxiv.org/abs/2303.05657

        Args:
            med_config (str): path for the mixture of encoder-decoder model's configuration file
            image_size (int): input image size
            vit (str): model size of vision transformer; ``'swin_b'`` selects
                the Swin encoder, anything else is passed to ``create_vit``
            vit_grad_ckpt (bool): forwarded to ``create_vit``
            vit_ckpt_layer (int): forwarded to ``create_vit``
            prompt (str): caption prefix fed to the text decoder
            threshold (float): default tagging threshold on the sigmoid score
            delete_tag_index (sequence of int): tags that may disturb
                captioning; a private copy is stored
            tag_list (str): path of the text file with one tag per line

        Raises:
            ValueError: ``vit == 'swin_b'`` with an ``image_size`` other than
                224 or 384.
        """
        super().__init__()

        # create image encoder
        if vit == 'swin_b':
            if image_size == 224:
                vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json'
            elif image_size == 384:
                vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json'
            else:
                # Previously this fell through and failed later with an
                # UnboundLocalError on vision_config_path.
                raise ValueError(
                    f"unsupported image_size {image_size!r} for vit='swin_b'; expected 224 or 384")
            vision_config = read_json(vision_config_path)
            assert image_size == vision_config['image_res']
            # assert config['patch_size'] == 32
            vision_width = vision_config['vision_width']

            self.visual_encoder = SwinTransformer(
                img_size=vision_config['image_res'],
                patch_size=4,
                in_chans=3,
                embed_dim=vision_config['embed_dim'],
                depths=vision_config['depths'],
                num_heads=vision_config['num_heads'],
                window_size=vision_config['window_size'],
                mlp_ratio=4.,
                qkv_bias=True,
                drop_rate=0.0,
                drop_path_rate=0.1,
                ape=False,
                patch_norm=True,
                use_checkpoint=False)
        else:
            self.visual_encoder, vision_width = create_vit(
                vit, image_size, vit_grad_ckpt, vit_ckpt_layer)

        # create tokenizer
        self.tokenizer = init_tokenizer()

        # Tag2Text employs an encoder-decoder architecture for image-tag-text
        # generation: image-tag interaction encoder and image-tag-text decoder.
        # create image-tag interaction encoder
        encoder_config = BertConfig.from_json_file(med_config)
        encoder_config.encoder_width = vision_width
        self.tag_encoder = BertModel(config=encoder_config,
                                     add_pooling_layer=False)

        # create image-tag-text decoder
        decoder_config = BertConfig.from_json_file(med_config)
        self.text_decoder = BertLMHeadModel(config=decoder_config)

        # delete some tags that may disturb captioning
        # 127: "quarter"; 2961: "back"; 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one"
        # Stored as a fresh list so instances never share (or mutate) the
        # default argument object.
        self.delete_tag_index = list(delete_tag_index)
        self.prompt = prompt
        self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1

        # load tag list
        self.tag_list = self.load_tag_list(tag_list)

        # create image-tag recognition decoder
        self.threshold = threshold
        self.num_class = len(self.tag_list)
        q2l_config = BertConfig.from_json_file(f'{CONFIG_PATH}/configs/q2l_config.json')
        q2l_config.encoder_width = vision_width
        self.tagging_head = BertModel(config=q2l_config,
                                      add_pooling_layer=False)
        self.tagging_head.resize_token_embeddings(len(self.tokenizer))
        self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
        self.fc = GroupWiseLinear(self.num_class,
                                  q2l_config.hidden_size,
                                  bias=True)
        self.del_selfattention()
        self.tagging_loss_function = AsymmetricLoss(gamma_neg=7,
                                                    gamma_pos=0,
                                                    clip=0.05)

        # share weights of the lowest 2-layer of "image-tag interaction encoder"
        # with the "image-tag recognition decoder"
        tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '',
                                    ' ')

        # adjust thresholds for some tags
        # default threshold: 0.68
        # 2701: "person"; 2828: "man"; 1167: "woman";
        tag_thrshold = {2701: 0.7, 2828: 0.7, 1167: 0.7}
        self.class_threshold = torch.ones(self.num_class) * self.threshold
        for key, value in tag_thrshold.items():
            self.class_threshold[key] = value

    def load_tag_list(self, tag_list_file):
        """Read ``tag_list_file`` (one tag per line) into a NumPy string array."""
        with open(tag_list_file, 'r') as f:
            tag_list = f.read().splitlines()
        tag_list = np.array(tag_list)
        return tag_list

    # delete self-attention layer of image-tag recognition decoder to reduce
    # computation, following Query2Label
    def del_selfattention(self):
        del self.tagging_head.embeddings
        for layer in self.tagging_head.encoder.layer:
            del layer.attention

    # ---- private helpers shared by forward / condition_forward / generate ----

    def _tagging_logits(self, image_embeds, image_atts):
        """Run the tag-recognition head and return its per-class logits."""
        bs = image_embeds.shape[0]
        label_embed = self.label_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
        tagging_embed = self.tagging_head(
            encoder_embeds=label_embed,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=False,
            mode='tagging',
        )
        return self.fc(tagging_embed[0])

    def _threshold_tags(self, logits, device):
        """Binarise logits: 1.0 where sigmoid exceeds the per-class threshold."""
        return torch.where(
            torch.sigmoid(logits) > self.class_threshold.to(device),
            torch.tensor(1.0).to(device),
            torch.zeros(self.num_class).to(device))

    def _join_tags(self, tag, batch_size, separator):
        """Turn each of the first ``batch_size`` rows of the 0/1 array ``tag`` into one string."""
        tag_input = []
        for b in range(batch_size):
            index = np.argwhere(tag[b] == 1)
            token = self.tag_list[index].squeeze(axis=1)
            tag_input.append(separator.join(token))
        return tag_input

    def _encode_tags(self, tag_input, image_embeds, image_atts, device):
        """Tokenise tag strings and pass them through the image-tag interaction encoder."""
        tag_tokens = self.tokenizer(tag_input,
                                    padding='max_length',
                                    truncation=True,
                                    max_length=40,
                                    return_tensors="pt").to(device)
        encoder_input_ids = tag_tokens.input_ids
        # the first token is replaced by the dedicated [ENC] token
        encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
        return self.tag_encoder(
            encoder_input_ids,
            attention_mask=tag_tokens.attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_atts,
            return_dict=True,
        )

    def forward(self, image, caption, tag):
        """
        call function as forward

        Args:
            image: type: torch.Tensor  shape: batch_size * 3 * 384 * 384
            caption: type: list[string]  len: batch_size
            tag: type: torch.Tensor   shape: batch * class_num (e.g. 3429)   value: positive sample is 1.0, negative sample is 0.0

        Returns:
            loss: type: torch.Tensor
        """
        image_embeds = self.visual_encoder(image)
        image_atts = torch.ones(image_embeds.size()[:-1],
                                dtype=torch.long).to(image.device)

        ##================= Image Tagging ================##
        bs = image_embeds.shape[0]
        logits = self._tagging_logits(image_embeds, image_atts)
        loss_tag = self.tagging_loss_function(logits, tag)

        ##================= Image-Tag-Text Generation ================##
        tag = tag.cpu().numpy()
        tag_input = self._join_tags(tag, bs, ' | ')

        # put input tag into image-tag interaction encoder to interact with image embeddings
        output_tagembedding = self._encode_tags(tag_input, image_embeds,
                                                image_atts, image.device)

        text = self.tokenizer(caption,
                              padding='longest',
                              truncation=True,
                              max_length=40,
                              return_tensors="pt").to(image.device)

        decoder_input_ids = text.input_ids
        decoder_input_ids[:, 0] = self.tokenizer.bos_token_id

        # padding and prompt positions are excluded from the LM loss
        decoder_targets = decoder_input_ids.masked_fill(
            decoder_input_ids == self.tokenizer.pad_token_id, -100)
        decoder_targets[:, :self.prompt_length] = -100

        decoder_output = self.text_decoder(
            decoder_input_ids,
            attention_mask=text.attention_mask,
            encoder_hidden_states=output_tagembedding.last_hidden_state,
            encoder_attention_mask=None,
            labels=decoder_targets,
            return_dict=True,
        )

        loss_t2t = decoder_output.loss

        # balance loss scale
        loss = loss_t2t + loss_tag / (loss_tag / loss_t2t).detach()

        return loss

    def generate_image_embeds(self, image, condition=False):
        """Return the visual encoder output for ``image`` (``condition`` is unused)."""
        image_embeds = self.visual_encoder(image)
        return image_embeds

    def condition_forward(self,
                          image,
                          sample=False,
                          num_beams=3,
                          max_length=30,
                          min_length=10,
                          top_p=0.9,
                          repetition_penalty=1.0,
                          tag_input=None,
                          return_tag_predict=False):
        """Return ``(image_embeds, logits, targets)`` for ``image``.

        ``targets`` is the thresholded 0/1 tag matrix with the columns in
        ``self.delete_tag_index`` zeroed. Only ``image`` is read; the other
        parameters are kept for signature compatibility.
        """
        image_embeds = self.visual_encoder(image)
        image_atts = torch.ones(image_embeds.size()[:-1],
                                dtype=torch.long).to(image.device)

        logits = self._tagging_logits(image_embeds, image_atts)
        targets = self._threshold_tags(logits, image.device)

        # delete some tags that may disturb captioning
        targets[:, self.delete_tag_index] = 0

        return image_embeds, logits, targets

    def generate(self,
                 image,
                 sample=False,
                 num_beams=3,
                 max_length=30,
                 min_length=10,
                 top_p=0.9,
                 repetition_penalty=1.0,
                 tag_input=None,
                 return_tag_predict=False):
        """Caption ``image``, optionally guided by user-supplied tags.

        Args:
            image: batch of input images.
            sample: nucleus sampling when true, beam search otherwise.
            num_beams: beam width (beam search only).
            max_length, min_length: forwarded to the decoder's ``generate``.
            top_p: nucleus probability mass (sampling only).
            repetition_penalty: forwarded in beam search. NOTE(review): the
                sampling branch passes a fixed 1.1 instead; kept as in the
                original to avoid changing outputs.
            tag_input: list of tag strings, one per image; predicted from the
                image when ``None``.
            return_tag_predict: also return the tag strings that were used.

        Returns:
            ``captions`` or ``(captions, tag_output)``.
        """
        image_embeds = self.visual_encoder(image)
        image_atts = torch.ones(image_embeds.size()[:-1],
                                dtype=torch.long).to(image.device)

        # if the user did not specify tags, recognise them with the
        # image-tag recognition decoder
        if tag_input is None:
            bs = image_embeds.shape[0]
            logits = self._tagging_logits(image_embeds, image_atts)
            targets = self._threshold_tags(logits, image.device)

            tag = targets.cpu().numpy()

            # delete some tags that may disturb captioning
            tag[:, self.delete_tag_index] = 0

            tag_input = self._join_tags(tag, bs, ', ')

        tag_output = tag_input

        # beam search for text generation (default): every image and its tag
        # string are repeated once per beam
        if not sample:
            image_embeds = image_embeds.repeat_interleave(num_beams, dim=0)
            tag_input = [
                tag_text for tag_text in tag_input for _ in range(num_beams)
            ]

        image_atts = torch.ones(image_embeds.size()[:-1],
                                dtype=torch.long).to(image.device)

        # put input tag into image-tag interaction encoder to interact with image embeddings
        output_tagembedding = self._encode_tags(tag_input, image_embeds,
                                                image_atts, image.device)

        # prompt trick for better captioning, following BLIP
        prompt = [self.prompt] * image.size(0)
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(
            image.device)
        input_ids[:, 0] = self.tokenizer.bos_token_id
        input_ids = input_ids[:, :-1]

        model_kwargs = {
            "encoder_hidden_states": output_tagembedding.last_hidden_state,
            "encoder_attention_mask": None
        }

        if sample:
            # nucleus sampling
            outputs = self.text_decoder.generate(
                input_ids=input_ids,
                max_length=max_length,
                min_length=min_length,
                do_sample=True,
                top_p=top_p,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.sep_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                repetition_penalty=1.1,
                **model_kwargs)
        else:
            # beam search (default)
            outputs = self.text_decoder.generate(
                input_ids=input_ids,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                eos_token_id=self.tokenizer.sep_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                repetition_penalty=repetition_penalty,
                **model_kwargs)

        # strip the prompt prefix from every decoded caption
        prompt_chars = len(self.prompt)
        captions = [
            self.tokenizer.decode(output, skip_special_tokens=True)[prompt_chars:]
            for output in outputs
        ]

        if return_tag_predict:
            return captions, tag_output
        return captions
def __init__(self,
             med_config=f'{CONFIG_PATH}/configs/med_config.json',
             image_size=384,
             vit='base',
             vit_grad_ckpt=False,
             vit_ckpt_layer=0,
             prompt='a picture of ',
             threshold=0.68,
             delete_tag_index=(127, 2961, 3351, 3265, 3338, 3355, 3359),
             tag_list=f'{CONFIG_PATH}/data/tag_list.txt'):
    r""" Tag2Text inference module, both captioning and tagging are included.
    Tag2Text is an efficient and controllable vision-language pre-training framework.
    Described in the paper "Tag2Text: Guiding Vision-Language Model via Image Tagging" https://arxiv.org/abs/2303.05657

    Args:
        med_config (str): path for the mixture of encoder-decoder model's configuration file
        image_size (int): input image size
        vit (str): model size of vision transformer; ``'swin_b'`` selects the
            Swin encoder, anything else is passed to ``create_vit``
        vit_grad_ckpt (bool): forwarded to ``create_vit``
        vit_ckpt_layer (int): forwarded to ``create_vit``
        prompt (str): caption prefix fed to the text decoder
        threshold (float): default tagging threshold on the sigmoid score
        delete_tag_index (sequence of int): tags that may disturb captioning;
            a private copy is stored
        tag_list (str): path of the text file with one tag per line

    Raises:
        ValueError: ``vit == 'swin_b'`` with an ``image_size`` other than
            224 or 384.
    """
    super().__init__()

    # create image encoder
    if vit == 'swin_b':
        if image_size == 224:
            vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json'
        elif image_size == 384:
            vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json'
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on vision_config_path.
            raise ValueError(
                f"unsupported image_size {image_size!r} for vit='swin_b'; expected 224 or 384")
        vision_config = read_json(vision_config_path)
        assert image_size == vision_config['image_res']
        # assert config['patch_size'] == 32
        vision_width = vision_config['vision_width']

        self.visual_encoder = SwinTransformer(
            img_size=vision_config['image_res'],
            patch_size=4,
            in_chans=3,
            embed_dim=vision_config['embed_dim'],
            depths=vision_config['depths'],
            num_heads=vision_config['num_heads'],
            window_size=vision_config['window_size'],
            mlp_ratio=4.,
            qkv_bias=True,
            drop_rate=0.0,
            drop_path_rate=0.1,
            ape=False,
            patch_norm=True,
            use_checkpoint=False)
    else:
        self.visual_encoder, vision_width = create_vit(
            vit, image_size, vit_grad_ckpt, vit_ckpt_layer)

    # create tokenizer
    self.tokenizer = init_tokenizer()

    # Tag2Text employs an encoder-decoder architecture for image-tag-text
    # generation: image-tag interaction encoder and image-tag-text decoder.
    # create image-tag interaction encoder
    encoder_config = BertConfig.from_json_file(med_config)
    encoder_config.encoder_width = vision_width
    self.tag_encoder = BertModel(config=encoder_config,
                                 add_pooling_layer=False)

    # create image-tag-text decoder
    decoder_config = BertConfig.from_json_file(med_config)
    self.text_decoder = BertLMHeadModel(config=decoder_config)

    # delete some tags that may disturb captioning
    # 127: "quarter"; 2961: "back"; 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one"
    # Stored as a fresh list so instances never share (or mutate) the
    # default argument object.
    self.delete_tag_index = list(delete_tag_index)
    self.prompt = prompt
    self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1

    # load tag list
    self.tag_list = self.load_tag_list(tag_list)

    # create image-tag recognition decoder
    self.threshold = threshold
    self.num_class = len(self.tag_list)
    q2l_config = BertConfig.from_json_file(f'{CONFIG_PATH}/configs/q2l_config.json')
    q2l_config.encoder_width = vision_width
    self.tagging_head = BertModel(config=q2l_config,
                                  add_pooling_layer=False)
    self.tagging_head.resize_token_embeddings(len(self.tokenizer))
    self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
    self.fc = GroupWiseLinear(self.num_class,
                              q2l_config.hidden_size,
                              bias=True)
    self.del_selfattention()
    self.tagging_loss_function = AsymmetricLoss(gamma_neg=7,
                                                gamma_pos=0,
                                                clip=0.05)

    # share weights of the lowest 2-layer of "image-tag interaction encoder"
    # with the "image-tag recognition decoder"
    tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '',
                                ' ')

    # adjust thresholds for some tags
    # default threshold: 0.68
    # 2701: "person"; 2828: "man"; 1167: "woman";
    tag_thrshold = {2701: 0.7, 2828: 0.7, 1167: 0.7}
    self.class_threshold = torch.ones(self.num_class) * self.threshold
    for key, value in tag_thrshold.items():
        self.class_threshold[key] = value
def forward(self, image, caption, tag):
    """
    Compute the joint tagging + captioning training loss.

    Args:
        image: type: torch.Tensor  shape: batch_size * 3 * 384 * 384
        caption: type: list[string]  len: batch_size
        tag: type: torch.Tensor   shape: batch * class_num (e.g. 3429)   value: positive sample is 1.0, negative sample is 0.0

    Returns:
        loss: type: torch.Tensor
    """
    device = image.device
    image_embeds = self.visual_encoder(image)
    image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
    bs = image_embeds.shape[0]

    ##================= Image Tagging ================##
    label_queries = self.label_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
    head_out = self.tagging_head(
        encoder_embeds=label_queries,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
        return_dict=False,
        mode='tagging',
    )
    logits = self.fc(head_out[0])
    loss_tag = self.tagging_loss_function(logits, tag)

    ##================= Image-Tag-Text Generation ================##
    tag = tag.cpu().numpy()
    # one " | "-separated string of positive tag names per image
    tag_input = [
        ' | '.join(self.tag_list[np.argwhere(tag[b] == 1)].squeeze(axis=1))
        for b in range(bs)
    ]

    # tokenize the tag strings; position 0 is overwritten with the [ENC] id
    tag_tokens = self.tokenizer(
        tag_input,
        padding='max_length',
        truncation=True,
        max_length=40,
        return_tensors="pt",
    ).to(device)
    encoder_input_ids = tag_tokens.input_ids
    encoder_input_ids[:, 0] = self.tokenizer.enc_token_id

    # let the tags interact with the image embeddings
    output_tagembedding = self.tag_encoder(
        encoder_input_ids,
        attention_mask=tag_tokens.attention_mask,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
        return_dict=True,
    )

    text = self.tokenizer(
        caption,
        padding='longest',
        truncation=True,
        max_length=40,
        return_tensors="pt",
    ).to(device)
    decoder_input_ids = text.input_ids
    decoder_input_ids[:, 0] = self.tokenizer.bos_token_id

    # padding and prompt positions are ignored by the LM loss (-100)
    pad_positions = decoder_input_ids == self.tokenizer.pad_token_id
    decoder_targets = decoder_input_ids.masked_fill(pad_positions, -100)
    decoder_targets[:, :self.prompt_length] = -100

    decoder_output = self.text_decoder(
        decoder_input_ids,
        attention_mask=text.attention_mask,
        encoder_hidden_states=output_tagembedding.last_hidden_state,
        encoder_attention_mask=None,
        labels=decoder_targets,
        return_dict=True,
    )
    loss_t2t = decoder_output.loss

    # balance loss scale: rescale the tagging loss by a detached ratio
    scale = (loss_tag / loss_t2t).detach()
    return loss_t2t + loss_tag / scale
torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) # if not user specified tags, recognized image tags using image-tag recogntiion decoder if tag_input == None: bs = image_embeds.shape[0] label_embed = self.label_embed.weight.unsqueeze(0).repeat(bs, 1, 1) tagging_embed = self.tagging_head( encoder_embeds=label_embed, encoder_hidden_states=image_embeds, encoder_attention_mask=image_atts, return_dict=False, mode='tagging', ) logits = self.fc(tagging_embed[0]) targets = torch.where( torch.sigmoid(logits) > self.class_threshold.to(image.device), torch.tensor(1.0).to(image.device), torch.zeros(self.num_class).to(image.device)) tag = targets.cpu().numpy() # delete some tags that may disturb captioning tag[:, self.delete_tag_index] = 0 tag_input = [] for b in range(bs): index = np.argwhere(tag[b] == 1) token = self.tag_list[index].squeeze(axis=1) tag_input.append(', '.join(token)) tag_output = tag_input # beam search for text generation(default) if not sample: image_embeds = image_embeds.repeat_interleave(num_beams, dim=0) tag_input_temp = [] for tag in tag_input: for i in range(num_beams): tag_input_temp.append(tag) tag_input = tag_input_temp image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) # tokenizer input tags tag_input_tokenzier = self.tokenizer(tag_input, padding='max_length', truncation=True, max_length=40, return_tensors="pt").to( image.device) encoder_input_ids = tag_input_tokenzier.input_ids encoder_input_ids[:, 0] = self.tokenizer.enc_token_id # put input tag into image-tag interaction encoder to interact with image embeddings output_tagembedding = self.tag_encoder( encoder_input_ids, attention_mask=tag_input_tokenzier.attention_mask, encoder_hidden_states=image_embeds, encoder_attention_mask=image_atts, return_dict=True, ) # prompt trick for better captioning, followed BLIP prompt = [self.prompt] * image.size(0) input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to( image.device) 
# load Tag2Text pretrained model parameters
def tag2text(pretrained='', **kwargs):
    """Build a ``Tag2Text`` model and optionally load pretrained weights.

    Args:
        pretrained: checkpoint URL or file path; empty/falsy skips loading.
        **kwargs: forwarded unchanged to ``Tag2Text``.

    Returns:
        The constructed (and possibly weight-loaded) model.

    Notes:
        ``vit`` and ``image_size`` may be omitted. The previous version
        indexed ``kwargs['vit']`` directly and raised ``KeyError`` whenever
        the caller relied on the constructor defaults.
    """
    model = Tag2Text(**kwargs)
    if pretrained:
        # Fallbacks mirror the defaults of Tag2Text.__init__.
        vit = kwargs.get('vit', 'base')
        if vit == 'swin_b':
            loader_kwargs = dict(kwargs)
            loader_kwargs.setdefault('image_size', 384)
            model, msg = load_checkpoint_swinbase(model, pretrained, loader_kwargs)
        else:
            model, msg = load_checkpoint(model, pretrained)
        print('vit:', vit)
        # print('msg', msg)
    return model
def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key: str):
    """Make ``encoder`` share parameters with the matching sub-modules of ``decoder``.

    The decoder's module tree is walked recursively. For every decoder
    sub-module that has a ``weight`` attribute and whose path does not
    contain ``skip_key``, the same-named encoder sub-module's ``weight``
    (and ``bias``, when the decoder module has that attribute) is rebound
    to the decoder's object.

    Args:
        encoder: module whose parameters get replaced.
        decoder: module that provides the shared parameters.
        base_model_prefix: prefix of the module paths (paths look like
            ``prefix/child/grandchild``).
        skip_key: sub-modules whose path contains this string are not tied.

    Raises:
        AssertionError: the two trees do not line up.
        ValueError: recursion depth exceeds 500.

    Notes:
        The class-mismatch message used to go through an undefined ``logger``
        name and raised ``NameError``; it now uses the standard ``logging``
        module.
    """
    import logging  # local import: this file defines no module-level logger

    log = logging.getLogger(__name__)

    uninitialized_encoder_weights: List[str] = []
    if decoder.__class__ != encoder.__class__:
        log.info(
            f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
        )

    def tie_encoder_to_decoder_recursively(
        decoder_pointer: nn.Module,
        encoder_pointer: nn.Module,
        module_name: str,
        uninitialized_encoder_weights: List[str],
        skip_key: str,
        depth=0,
    ):
        assert isinstance(decoder_pointer, nn.Module) and isinstance(
            encoder_pointer, nn.Module
        ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"

        # Leaf with parameters: share them and stop descending.
        if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
            assert hasattr(encoder_pointer, "weight")
            encoder_pointer.weight = decoder_pointer.weight
            if hasattr(decoder_pointer, "bias"):
                assert hasattr(encoder_pointer, "bias")
                encoder_pointer.bias = decoder_pointer.bias
            print(module_name + ' is tied')
            return

        encoder_modules = encoder_pointer._modules
        decoder_modules = decoder_pointer._modules
        if len(decoder_modules) > 0:
            assert (
                len(encoder_modules) > 0
            ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"

            all_encoder_weights = {
                module_name + "/" + sub_name for sub_name in encoder_modules
            }
            encoder_layer_pos = 0
            for name in decoder_modules:
                if name.isdigit():
                    encoder_name = str(int(name) + encoder_layer_pos)
                    decoder_name = name
                    if not isinstance(
                            decoder_modules[decoder_name],
                            type(encoder_modules[encoder_name])) and len(
                                encoder_modules) != len(decoder_modules):
                        # this can happen if the name corresponds to the position in a module list of layers
                        # in this case the decoder has added a cross-attention that the encoder does not have
                        # thus skip this step and subtract one layer pos from encoder
                        encoder_layer_pos -= 1
                        continue
                elif name not in encoder_modules:
                    continue
                elif depth > 500:
                    raise ValueError(
                        "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
                    )
                else:
                    decoder_name = encoder_name = name
                tie_encoder_to_decoder_recursively(
                    decoder_modules[decoder_name],
                    encoder_modules[encoder_name],
                    module_name + "/" + name,
                    uninitialized_encoder_weights,
                    skip_key,
                    depth=depth + 1,
                )
                all_encoder_weights.remove(module_name + "/" + encoder_name)

            # encoder children that had no decoder counterpart
            uninitialized_encoder_weights += list(all_encoder_weights)

    # tie weights recursively
    tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix,
                                       uninitialized_encoder_weights, skip_key)
def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
    """Build a ``VisionTransformer`` image encoder.

    Args:
        vit: ``'base'`` (width 768, depth 12, 12 heads) or ``'large'``
            (width 1024, depth 24, 16 heads).
        image_size: forwarded as ``img_size``.
        use_grad_checkpointing: forwarded to the encoder.
        ckpt_layer: forwarded to the encoder.
        drop_path_rate: stochastic-depth rate. A falsy value selects the
            per-size default (0 for ``'base'``, 0.1 for ``'large'``).

    Returns:
        ``(visual_encoder, vision_width)``.

    Notes:
        The large branch used ``0.1 or drop_path_rate``, which is always
        0.1, so an explicit rate was silently ignored. Omitting the rate
        still gives 0.1.
    """
    # utils.py imports only interpolate_pos_embed from .vit, so the class is
    # imported here. NOTE(review): assumes .vit defines VisionTransformer -
    # confirm against ram/models/vit.py.
    from .vit import VisionTransformer

    assert vit in ['base', 'large'], "vit parameter must be base or large"
    if vit == 'base':
        vision_width = 768
        visual_encoder = VisionTransformer(
            img_size=image_size,
            patch_size=16,
            embed_dim=vision_width,
            depth=12,
            num_heads=12,
            use_grad_checkpointing=use_grad_checkpointing,
            ckpt_layer=ckpt_layer,
            drop_path_rate=drop_path_rate or 0)
    elif vit == 'large':
        vision_width = 1024
        visual_encoder = VisionTransformer(
            img_size=image_size,
            patch_size=16,
            embed_dim=vision_width,
            depth=24,
            num_heads=16,
            use_grad_checkpointing=use_grad_checkpointing,
            ckpt_layer=ckpt_layer,
            drop_path_rate=drop_path_rate or 0.1)
    return visual_encoder, vision_width
def load_checkpoint_swinbase(model, url_or_filename, kwargs):
    """Load a Swin-B based checkpoint into ``model``.

    Args:
        model: module that receives the weights via
            ``load_state_dict(strict=False)``.
        url_or_filename: http(s) URL or local path of the checkpoint.
        kwargs: mapping that must contain ``'image_size'`` (224 or 384); it
            selects the Swin config whose ``window_size`` is used to resize
            the relative-position bias tables.

    Returns:
        ``(model, msg)`` where ``msg`` is the result of ``load_state_dict``.

    Raises:
        ValueError: ``image_size`` is neither 224 nor 384 (previously an
            ``UnboundLocalError``).
        RuntimeError: ``url_or_filename`` is neither a URL nor an existing
            file.
    """
    image_size = kwargs['image_size']
    if image_size == 224:
        vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json'
    elif image_size == 384:
        vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json'
    else:
        raise ValueError(
            f"unsupported image_size {image_size!r}; expected 224 or 384")
    window_size = read_json(vision_config_path)['window_size']
    print('--------------')
    print(url_or_filename)
    print('--------------')

    # NOTE: torch.load unpickles the file; only load checkpoints from
    # sources you trust.
    if is_url(url_or_filename):
        cached_file = download_cached_file(url_or_filename,
                                           check_hash=False,
                                           progress=True)
        checkpoint = torch.load(cached_file, map_location='cpu')
    elif os.path.isfile(url_or_filename):
        checkpoint = torch.load(url_or_filename, map_location='cpu')
    else:
        raise RuntimeError('checkpoint url or path is invalid')

    state_dict = checkpoint['model']

    # same for every bias table, so computed once
    dst_num_pos = (2 * window_size - 1)**2

    # iterate over a snapshot of the keys because entries are removed/renamed
    for k in list(state_dict.keys()):
        if 'relative_position_bias_table' in k:
            state_dict[k] = interpolate_relative_pos_embed(state_dict[k],
                                                           dst_num_pos,
                                                           param_name=k)
        elif ('relative_position_index' in k) or ('attn_mask' in k):
            del state_dict[k]
        elif "vision_multi" in k:
            state_dict[k.replace("vision_multi",
                                 "tagging_head")] = state_dict.pop(k)

    msg = model.load_state_dict(state_dict, strict=False)
    print('load checkpoint from %s' % url_or_filename)
    return model, msg
class AsymmetricLoss(nn.Module):
    """Asymmetric multi-label loss with separate focusing for positives/negatives.

    Args:
        gamma_neg: focusing exponent applied to negative targets.
        gamma_pos: focusing exponent applied to positive targets.
        clip: probability margin added to the negative probabilities (then
            clamped to 1); ``None`` or ``0`` disables clipping.
        eps: lower clamp applied before taking logarithms.
        disable_torch_grad_focal_loss: when True, the focusing weights are
            computed without gradient tracking.
    """

    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
        super().__init__()

        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps

    def forward(self, x, y):
        """Return the summed (scalar) asymmetric loss.

        Parameters
        ----------
        x: input logits
        y: targets (multi-label binarized vector)
        """

        # Calculating Probabilities
        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid

        # Asymmetric Clipping
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)

        # Basic CE calculation
        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
        loss = los_pos + los_neg

        # Asymmetric Focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            # Use the context-manager form so the caller's grad mode is
            # restored afterwards. Unconditionally calling
            # set_grad_enabled(True) would re-enable autograd inside an
            # enclosing torch.no_grad() block.
            track_grad = torch.is_grad_enabled() and not self.disable_torch_grad_focal_loss
            with torch.set_grad_enabled(track_grad):
                pt0 = xs_pos * y
                pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
                pt = pt0 + pt1
                one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
                one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            loss = loss * one_sided_w

        return -loss.sum()
class Mlp(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    ``hidden_features`` and ``out_features`` fall back to ``in_features`` when
    not given (or falsy). The same dropout module is applied after both layers.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_dim = out_features or in_features
        hidden_dim = hidden_features or in_features
        # Creation order (fc1, act, fc2, drop) is kept so parameter
        # initialisation and state_dict ordering stay the same.
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each in a residual branch.

    When ``use_grad_checkpointing`` is set, the attention and MLP sub-modules
    are wrapped with ``checkpoint_wrapper``.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False):
        super().__init__()
        self.norm1 = norm_layer(dim)
        attention = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.attn = checkpoint_wrapper(attention) if use_grad_checkpointing else attention

        # Stochastic depth on the residual branches; a no-op when the rate is 0.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        self.norm2 = norm_layer(dim)
        hidden_dim = int(dim * mlp_ratio)
        feed_forward = Mlp(in_features=dim, hidden_features=hidden_dim, act_layer=act_layer, drop=drop)
        self.mlp = checkpoint_wrapper(feed_forward) if use_grad_checkpointing else feed_forward

    def forward(self, x, register_hook=False):
        attn_out = self.attn(self.norm1(x), register_hook=register_hook)
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_out)
@torch.no_grad()
def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
    """Load weights from .npz checkpoints for official Google Brain Flax implementation.

    Args:
        model: the vision transformer whose parameters are overwritten in place.
        checkpoint_path: path of the ``.npz`` archive passed to ``np.load``.
        prefix: key prefix inside the archive; when empty and the archive
            contains ``'opt/target/embedding/kernel'``, ``'opt/target/'`` is used.

    Only the patch embedding, cls token, position embedding, final norm and
    the transformer blocks are copied; head / pre_logits entries in the
    archive are not loaded.
    """
    import numpy as np

    def _n2p(w, t=True):
        # Convert a numpy array to a torch tensor, optionally permuting the
        # axes of 2-D/3-D/4-D kernels (see the transposes below).
        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
            w = w.flatten()
        if t:
            if w.ndim == 4:
                w = w.transpose([3, 2, 0, 1])
            elif w.ndim == 3:
                w = w.transpose([2, 0, 1])
            elif w.ndim == 2:
                w = w.transpose([1, 0])
        return torch.from_numpy(w)

    w = np.load(checkpoint_path)
    if not prefix and 'opt/target/embedding/kernel' in w:
        prefix = 'opt/target/'

    if hasattr(model.patch_embed, 'backbone'):
        # hybrid
        backbone = model.patch_embed.backbone
        stem_only = not hasattr(backbone, 'stem')
        stem = backbone if stem_only else backbone.stem
        stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
        if not stem_only:
            for i, stage in enumerate(backbone.stages):
                for j, block in enumerate(stage.blocks):
                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
                    for r in range(3):
                        getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
                        getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
                        getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
                    if block.downsample is not None:
                        block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
                        block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
                        block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
    else:
        embed_conv_w = adapt_input_conv(
            model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
    model.patch_embed.proj.weight.copy_(embed_conv_w)
    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
    pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
    if pos_embed_w.shape != model.pos_embed.shape:
        # `resize_pos_embed` is not among this module's top-level imports, so
        # this branch used to raise NameError. Import it where it is needed.
        from timm.models.vision_transformer import resize_pos_embed
        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
            pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
    model.pos_embed.copy_(pos_embed_w)
    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
    for i, block in enumerate(model.blocks.children()):
        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
        # Query/key/value are stored separately in the archive; concatenate
        # them into the fused qkv projection.
        block.attn.qkv.weight.copy_(torch.cat([
            _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
        block.attn.qkv.bias.copy_(torch.cat([
            _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
        block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
        for r in range(2):
            getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
            getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
# Inference script: runs the trained network over every PNG in --LR_dir and
# writes the super-resolved results to --SR_dir under the same file names.
import torch, os, glob, copy
import torch.nn.functional as F
import numpy as np
from PIL import Image
from argparse import ArgumentParser
from torchvision import transforms
from model import Net

# Command-line options: checkpoint epoch, checkpoint directory and image folders.
parser = ArgumentParser()
parser.add_argument("--epoch", type=int, default=200)
parser.add_argument("--model_dir", type=str, default="weight")
parser.add_argument("--LR_dir", type=str, default="testset/RealSR/LR")
parser.add_argument("--HR_dir", type=str, default="testset/RealSR/HR")
parser.add_argument("--SR_dir", type=str, default="result/RealSR")

args = parser.parse_args()

device = torch.device("cuda")

# Load the Stable Diffusion pipeline and pull out its components.
# Of these, only `unet` is referenced further down in this script.
from diffusers import StableDiffusionPipeline
model_id = "stabilityai/stable-diffusion-2-1-base"
pipe = StableDiffusionPipeline.from_pretrained(model_id).to(device)

vae = pipe.vae
tokenizer = pipe.tokenizer
unet = pipe.unet
noise_scheduler = pipe.scheduler
text_encoder = pipe.text_encoder

# Build the decoder and load its weights from the "halfDecoder" checkpoint.
from diffusers.models.autoencoders.vae import Decoder
ckpt_halfdecoder = torch.load("./weight/pretrained/halfDecoder.ckpt", weights_only=False)
decoder = Decoder(in_channels=4,
                  out_channels=3,
                  up_block_types=["UpDecoderBlock2D" for _ in range(4)],
                  block_out_channels=[64, 128, 256, 256],
                  layers_per_block=2,
                  norm_num_groups=32,
                  act_fn="silu",
                  norm_type="group",
                  mid_block_add_attention=True).to(device)
# Keep only checkpoint entries whose key mentions "decoder" and strip the
# "decoder." prefix so the keys match the Decoder module's own state dict.
decoder_ckpt = {}
for k,v in ckpt_halfdecoder["state_dict"].items():
    if "decoder" in k:
        new_k = k.replace("decoder.", "")
        decoder_ckpt[new_k] = v
decoder.load_state_dict(decoder_ckpt, strict=True)

# The trained weights are loaded through a DataParallel wrapper (the saved
# state dict is loaded into the wrapped model), then the wrapper is dropped.
# Net receives a deep copy of the decoder, so the up blocks and output layers
# appended below still carry the halfDecoder.ckpt weights.
model = torch.nn.DataParallel(Net(unet, copy.deepcopy(decoder)))
model.load_state_dict(torch.load("./%s/net_params_%d.pkl" % (args.model_dir, args.epoch), weights_only=False))
model = torch.nn.Sequential(
    model.module,
    *decoder.up_blocks,
    decoder.conv_norm_out,
    decoder.conv_act,
    decoder.conv_out,
).to(device)

# Only *.png files are picked up. test_HR_paths is collected but not used below.
test_LR_paths = list(sorted(glob.glob(os.path.join(args.LR_dir, "*.png"))))
test_HR_paths = list(sorted(glob.glob(os.path.join(args.HR_dir, "*.png"))))

os.makedirs(args.SR_dir, exist_ok=True)

with torch.no_grad():
    for i, path in enumerate(test_LR_paths):
        LR = Image.open(path).convert("RGB")
        # ToTensor output is rescaled with * 2 - 1 and given a batch dimension.
        LR = transforms.ToTensor()(LR).to(device).unsqueeze(0) * 2 - 1
        SR = model(LR)
        # Match the per-channel mean/std (over dims 2 and 3) of the output to
        # those of the low-resolution input.
        SR = (SR - SR.mean(dim=[2,3],keepdim=True)) / SR.std(dim=[2,3],keepdim=True) \
             * LR.std(dim=[2,3],keepdim=True) + LR.mean(dim=[2,3],keepdim=True)
        # Undo the * 2 - 1 scaling and clamp to [0, 1] before converting to PIL.
        SR = transforms.ToPILImage()((SR[0] / 2 + 0.5).clamp(0, 1).cpu())
        SR.save(os.path.join(args.SR_dir, os.path.basename(path)))
CUDA_VISIBLE_DEVICES=0 \ python -u test.py \ --epoch=200 \ --LR_dir=testset/RealSR/LR \ --SR_dir=result/RealSR ================================================ FILE: train.py ================================================ import torch, os, glob, random, copy import torch.nn.functional as F from torch.utils.data import DataLoader import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP import numpy as np from argparse import ArgumentParser from time import time from tqdm import tqdm from omegaconf import OmegaConf from dataset import RealESRGANDataset, RealESRGANDegrader from model import Net from ram.models.ram_lora import ram from torchvision import transforms from utils import add_lora_to_unet dist.init_process_group(backend="nccl", init_method="env://") rank = dist.get_rank() world_size = dist.get_world_size() parser = ArgumentParser() parser.add_argument("--epoch", type=int, default=200) parser.add_argument("--batch_size", type=int, default=12) parser.add_argument("--learning_rate", type=float, default=1e-4) parser.add_argument("--model_dir", type=str, default="weight") parser.add_argument("--log_dir", type=str, default="log") parser.add_argument("--save_interval", type=int, default=10) args = parser.parse_args() # fixed seed for reproduction seed = rank random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) config = OmegaConf.load("config.yml") epoch = args.epoch learning_rate = args.learning_rate bsz = args.batch_size device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu") torch.backends.cudnn.allow_tf32 = True torch.backends.cuda.matmul.allow_tf32 = True if rank == 0: print("batch size per gpu =", bsz) from diffusers import StableDiffusionPipeline model_id = "stabilityai/stable-diffusion-2-1-base" pipe = StableDiffusionPipeline.from_pretrained(model_id).to(device) vae = pipe.vae tokenizer = pipe.tokenizer unet = pipe.unet text_encoder = pipe.text_encoder 
unet_D = copy.deepcopy(unet) new_conv_in = torch.nn.Conv2d(256, 320, 3, padding=1).to(device) new_conv_in.weight.data = unet_D.conv_in.weight.data.repeat(1, 64, 1, 1) / 64 new_conv_in.bias.data = unet_D.conv_in.bias.data unet_D.conv_in = new_conv_in unet_D = add_lora_to_unet(unet_D) unet_D.set_adapters(["default_encoder", "default_decoder", "default_others"]) vae_teacher = copy.deepcopy(vae) unet_teacher = copy.deepcopy(unet) osediff = torch.load("./weight/pretrained/osediff.pkl", weights_only=False) vae_teacher.load_state_dict(osediff["vae"]) unet_teacher.load_state_dict(osediff["unet"]) from diffusers.models.autoencoders.vae import Decoder ckpt_halfdecoder = torch.load("./weight/pretrained/halfDecoder.ckpt", weights_only=False) decoder = Decoder(in_channels=4, out_channels=3, up_block_types=["UpDecoderBlock2D" for _ in range(4)], block_out_channels=[64, 128, 256, 256], layers_per_block=2, norm_num_groups=32, act_fn="silu", norm_type="group", mid_block_add_attention=True).to(device) decoder_ckpt = {} for k, v in ckpt_halfdecoder["state_dict"].items(): if "decoder" in k: new_k = k.replace("decoder.", "") decoder_ckpt[new_k] = v decoder.load_state_dict(decoder_ckpt, strict=True) ram_transforms = transforms.Compose([ transforms.Resize((384, 384)), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ]) DAPE = ram(pretrained="./weight/pretrained/ram_swin_large_14m.pth", pretrained_condition="./weight/pretrained/DAPE.pth", image_size=384, vit="swin_l").eval().to(device) vae.requires_grad_(False) unet.requires_grad_(False) text_encoder.requires_grad_(False) vae_teacher.requires_grad_(False) unet_teacher.requires_grad_(False) decoder.requires_grad_(False) DAPE.requires_grad_(False) model = DDP(Net(unet, copy.deepcopy(decoder)).to(device), device_ids=[rank]) model_D = DDP(unet_D.to(device), device_ids=[rank]) model.requires_grad_(True) model_D.requires_grad_(False) params_to_opt = [] for n, p in model_D.named_parameters(): if "lora" in n or 
"conv_in" in n: p.requires_grad = True params_to_opt.append(p) if rank == 0: param_cnt = sum(p.numel() for p in model.parameters() if p.requires_grad) print("#Param.", param_cnt/1e6, "M") dataset = RealESRGANDataset(config, bsz) degrader = RealESRGANDegrader(config, device) dataloader = DataLoader(dataset, batch_size=bsz, num_workers=8) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) optimizer_D = torch.optim.Adam(params_to_opt, lr=1e-6) scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100,], gamma=0.5) scaler = torch.cuda.amp.GradScaler() model_dir = "./%s" % (args.model_dir,) log_path = "./%s/log.txt" % (args.log_dir,) os.makedirs(model_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) print("start training...") timesteps = torch.tensor([999], device=device).long().expand(bsz,) alpha = pipe.scheduler.alphas_cumprod[999] for epoch_i in range(1, epoch + 1): start_time = time() loss_avg = 0.0 loss_distil_avg = 0.0 loss_adv_avg = 0.0 loss_D_avg = 0.0 iter_num = 0 dist.barrier() for batch in tqdm(dataloader): with torch.cuda.amp.autocast(enabled=True): with torch.no_grad(): LR, HR = degrader.degrade(batch) text_input = tokenizer(DAPE.generate_tag(ram_transforms(LR))[0], max_length=tokenizer.model_max_length, padding="max_length", return_tensors="pt").to(device) encoder_hidden_states = text_encoder(text_input.input_ids, return_dict=False)[0] LR, HR = LR * 2 - 1, HR * 2 - 1 LR_ = F.interpolate(LR, scale_factor=4, mode="bicubic") LR_latents = vae_teacher.encode(LR_).latent_dist.mean * vae_teacher.config.scaling_factor HR_latents = vae.encode(HR).latent_dist.mean pred_teacher = unet_teacher( LR_latents, timesteps, encoder_hidden_states=encoder_hidden_states, return_dict=False, )[0] z0_teacher = (LR_latents-((1-alpha)**0.5)*pred_teacher)/(alpha**0.5) z0_teacher = vae_teacher.post_quant_conv(z0_teacher / vae_teacher.config.scaling_factor) z0_teacher = decoder.conv_in(z0_teacher) z0_teacher = decoder.mid_block(z0_teacher) 
z0_gt = vae.post_quant_conv(HR_latents) z0_gt = decoder.conv_in(z0_gt) z0_gt = decoder.mid_block(z0_gt) z0_student = model(LR) loss_distil = (z0_student - z0_teacher).abs().mean() loss_adv = F.softplus(-model_D( z0_student, timesteps, encoder_hidden_states=encoder_hidden_states, return_dict=False, )[0]).mean() loss = loss_distil + loss_adv optimizer.zero_grad(set_to_none=True) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() with torch.cuda.amp.autocast(enabled=True): pred_real = model_D( z0_gt.detach(), timesteps, encoder_hidden_states=encoder_hidden_states, return_dict=False, )[0] pred_fake = model_D( z0_student.detach(), timesteps, encoder_hidden_states=encoder_hidden_states, return_dict=False, )[0] loss_D = F.softplus(pred_fake).mean() + F.softplus(-pred_real).mean() optimizer_D.zero_grad(set_to_none=True) scaler.scale(loss_D).backward() scaler.step(optimizer_D) scaler.update() loss_avg += loss.item() loss_distil_avg += loss_distil.item() loss_adv_avg += loss_adv.item() loss_D_avg += loss_D.item() iter_num += 1 # print("loss", loss.item()) # print("loss_distil", loss_distil.item()) # print("loss_adv", loss_adv.item()) # print("loss_D", loss_D.item()) scheduler.step() loss_avg /= iter_num loss_distil_avg /= iter_num loss_adv_avg /= iter_num loss_D_avg /= iter_num log_data = "[%d/%d] Average loss: %f, distil loss: %f, adv loss: %f, D loss: %f, time cost: %.2fs, cur lr is %f." 
def add_lora_to_unet(unet, rank=4):
    """Attach three LoRA adapters to ``unet`` and return it.

    Parameter names (skipping any containing "bias" or "norm") that contain
    one of the target substrings are split into three groups:
    names with "down_blocks"/"conv_in" -> ``default_encoder``,
    names with "up_blocks"/"conv_out" -> ``default_decoder``,
    everything else -> ``default_others``.
    Each group becomes the ``target_modules`` of its own ``LoraConfig``
    (rank ``rank``, gaussian init).
    """
    target_substrings = ["to_k", "to_q", "to_v", "to_out.0", "conv", "conv1", "conv2", "conv_shortcut",
                         "conv_out", "proj_out", "proj_in", "ff.net.2", "ff.net.0.proj"]
    encoder_targets = []
    decoder_targets = []
    other_targets = []

    for param_name, _ in unet.named_parameters():
        if "bias" in param_name or "norm" in param_name:
            continue
        if not any(fragment in param_name for fragment in target_substrings):
            continue
        module_name = param_name.replace(".weight","")
        if "down_blocks" in param_name or "conv_in" in param_name:
            encoder_targets.append(module_name)
        elif "up_blocks" in param_name or "conv_out" in param_name:
            decoder_targets.append(module_name)
        else:
            other_targets.append(module_name)

    # Adapters are registered in the same order: encoder, decoder, others.
    adapter_groups = (
        ("default_encoder", encoder_targets),
        ("default_decoder", decoder_targets),
        ("default_others", other_targets),
    )
    for adapter_name, targets in adapter_groups:
        lora_config = LoraConfig(r=rank, init_lora_weights="gaussian", target_modules=targets)
        unet.add_adapter(lora_config, adapter_name=adapter_name)
    return unet