Repository: zhang-zx/SINE Branch: master Commit: c0df9502b622 Files: 51 Total size: 480.7 KB Directory structure: gitextract_l03wpjs2/ ├── .gitignore ├── LICENSE ├── README.md ├── SINE.ipynb ├── configs/ │ └── stable-diffusion/ │ ├── v1-finetune_painting.yaml │ ├── v1-finetune_painting_style.yaml │ ├── v1-finetune_patch_painting.yaml │ ├── v1-finetune_patch_picture.yaml │ ├── v1-finetune_picture.yaml │ ├── v1-inference.yaml │ ├── v1-inference_patch.yaml │ └── v1-inference_patch_nearest_interp.yaml ├── diffusers_models.py ├── diffusers_sample.py ├── diffusers_train.py ├── environment.yml ├── ldm/ │ ├── data/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── personalized.py │ │ └── personalized_painting.py │ ├── lr_scheduler.py │ ├── modules/ │ │ ├── attention.py │ │ ├── diffusionmodules/ │ │ │ ├── __init__.py │ │ │ ├── model.py │ │ │ ├── openaimodel.py │ │ │ ├── positional_encoding.py │ │ │ └── util.py │ │ ├── distributions/ │ │ │ ├── __init__.py │ │ │ └── distributions.py │ │ ├── ema.py │ │ ├── embedding_manager.py │ │ ├── encoders/ │ │ │ ├── __init__.py │ │ │ ├── modules.py │ │ │ └── modules_bak.py │ │ ├── image_degradation/ │ │ │ ├── __init__.py │ │ │ ├── bsrgan.py │ │ │ ├── bsrgan_light.py │ │ │ └── utils_image.py │ │ ├── losses/ │ │ │ ├── __init__.py │ │ │ ├── contperceptual.py │ │ │ └── vqperceptual.py │ │ └── x_transformer.py │ └── util.py ├── main.py ├── scripts/ │ ├── download_first_stages.sh │ ├── download_models.sh │ ├── sample_diffusion.py │ ├── score.py │ ├── stable_txt2img_guidance.py │ └── stable_txt2img_multi_guidance.py └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # General .DS_Store .AppleDouble .LSOverride models # models/ logs logs/ exps/ datasets/ src/ # Icon must end with two \r Icon # Thumbnails ._* # Files that might appear in the root of a volume .DocumentRevisions-V100 .fseventsd .Spotlight-V100 .TemporaryItems .Trashes .VolumeIcon.icns .com.apple.timemachine.donotpresent # Directories potentially created on remote AFP share .AppleDB .AppleDesktop Network Trash Folder Temporary Items .apdisk # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments */logs/ */wandb/ *samples/ .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik and contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ ## SINE
SINgle Image Editing with Text-to-Image Diffusion Models [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zhang-zx/SINE/blob/master/SINE.ipynb) [Project](https://zhang-zx.github.io/SINE/) | [ArXiv](https://arxiv.org/abs/2212.04489) This respository contains the code for the CVPR 2023 paper [SINE: SINgle Image Editing with Text-to-Image Diffusion Models](https://arxiv.org/abs/2212.04489). For more visualization results, please check our [webpage](https://zhang-zx.github.io/SINE/). > **[SINE: SINgle Image Editing with Text-to-Image Diffusion Models](https://zhang-zx.github.io/SINE/)** \ > [Zhixing Zhang](https://zhang-zx.github.io/) 1, > [Ligong Han](https://phymhan.github.io/) 1, > [Arnab Ghosh](https://arnabgho.github.io/) 2, > [Dimitris Metaxas](https://people.cs.rutgers.edu/~dnm/) 1, > and [Jian Ren](https://alanspike.github.io/) 2 \ > 1 Rutgers University > 2 Snap Inc.\ > CVPR 2023.
## Setup First, clone the repository and install the dependencies: ```bash git clone git@github.com:zhang-zx/SINE.git ``` Then, install the dependencies following the [instructions](https://github.com/CompVis/stable-diffusion#stable-diffusion-v1). Alternatively, you can also try to use the following docker image. ```bash docker pull sunggukcha/sine ``` To fine-tune the model, you need to download the [pre-trained model](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4-full-ema.ckpt). ### Data Preparation The data we use in the paper can be found from [here](https://drive.google.com/drive/folders/1rGt5YTCwNgEag8MD_1wr9jrPpi1_8vfu?usp=sharing). ## Fine-tuning ### Fine-tuning w/o patch-based training scheme ```bash IMG_PATH=path/to/image CLS_WRD='coarse class word' NAME='name of the experiment' python main.py \ --base configs/stable-diffusion/v1-finetune_picture.yaml \ -t --actual_resume /path/to/pre-trained/model \ -n $NAME --gpus 0, --logdir ./logs \ --data_root $IMG_PATH \ --reg_data_root $IMG_PATH --class_word $CLS_WRD ``` ### Fine-tuning with patch-based training scheme ```bash IMG_PATH=path/to/image CLS_WRD='coarse class word' NAME='name of the experiment' python main.py \ --base configs/stable-diffusion/v1-finetune_patch_picture.yaml \ -t --actual_resume /path/to/pre-trained/model \ -n $NAME --gpus 0, --logdir ./logs \ --data_root $IMG_PATH \ --reg_data_root $IMG_PATH --class_word $CLS_WRD ``` ## Model-based Image Editing ### Editing with one model's guidance ```bash LOG_DIR=/path/to/logdir python scripts/stable_txt2img_guidance.py --ddim_eta 0.0 --n_iter 1 \ --scale 10 --ddim_steps 100 \ --sin_config configs/stable-diffusion/v1-inference.yaml \ --sin_ckpt $LOG_DIR"/checkpoints/last.ckpt" \ --prompt "prompt for pre-trained model[SEP]prompt for fine-tuned model" \ --cond_beta 0.4 \ --range_t_min 500 --range_t_max 1000 --single_guidance \ --skip_save --H 512 --W 512 --n_samples 2 \ --outdir $LOG_DIR ``` ### Editing with multiple models' guidance ```bash python scripts/stable_txt2img_multi_guidance.py --ddim_eta 0.0 --n_iter 2 \ --scale 10 --ddim_steps 100 \ --sin_ckpt path/to/ckpt1 path/to/ckpt2 \ --sin_config ./configs/stable-diffusion/v1-inference.yaml \ configs/stable-diffusion/v1-inference.yaml \ --prompt "prompt for pre-trained model[SEP]prompt for fine-tuned model1[SEP]prompt for fine-tuned model2" \ --beta 0.4 0.5 \ --range_t_min 400 400 --range_t_max 1000 1000 --single_guidance \ --H 512 --W 512 --n_samples 2 \ --outdir path/to/output_dir ``` ## Diffusers library Example The Diffusers Library support is still under development. Results in our paper are obtained using previous code based on LDM. ### Training ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" export IMG_PATH="path/to/image" export OUTPUT_DIR="path/to/output_dir" accelerate launch diffusers_train.py \ --pretrained_model_name_or_path=$MODEL_NAME \ --train_text_encoder \ --img_path=$IMG_PATH \ --output_dir=$OUTPUT_DIR \ --instance_prompt="prompt for fine-tuning" \ --resolution=512 \ --train_batch_size=1 \ --gradient_accumulation_steps=1 \ --learning_rate=1e-6 \ --lr_scheduler="constant" \ --lr_warmup_steps=0 \ --max_train_steps=NUMBERS_OF_STEPS \ --checkpointing_steps=FREQUENCY_FOR_CHECKPOINTING \ --patch_based_training # OPTIONAL: add this flag for patch-based training scheme ``` ### Sampling ```bash python diffusers_sample.py \ --pretrained_model_name_or_path "path/to/output_dir" \ --prompt "prompt for fine-tuned model" \ --editing_prompt 'prompt for pre-trained model' ``` ## Visualization Results Some of the editing results are shown below. See more results on our [webpage](https://zhang-zx.github.io/SINE/). ![image](assets/editing.png) ## Acknowledgments In this code we refer to the following implementations: [Dreambooth-Stable-Diffusion](https://github.com/XavierXiao/Dreambooth-Stable-Diffusion) and [stable-diffusion](https://github.com/CompVis/stable-diffusion#stable-diffusion-v1). Implementation with the Diffusers Library support is highly based on [Dreambooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth). Great thanks to them! ## Reference If our work or code helps you, please consider to cite our paper. Thank you! ```BibTeX @article{zhang2022sine, title={SINE: SINgle Image Editing with Text-to-Image Diffusion Models}, author={Zhang, Zhixing and Han, Ligong and Ghosh, Arnab and Metaxas, Dimitris and Ren, Jian}, journal={arXiv preprint arXiv:2212.04489}, year={2022} } ``` ================================================ FILE: SINE.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Welcome to SINE: SINgle Image Editing with Text-to-Image Diffusion Models!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "gpu_info = !nvidia-smi\n", "gpu_info = '\\n'.join(gpu_info)\n", "if gpu_info.find('failed') >= 0:\n", " print('Not connected to a GPU')\n", "else:\n", " print(gpu_info)\n", "\n", "from psutil import virtual_memory\n", "ram_gb = virtual_memory().total / 1e9\n", "print('Your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))\n", "\n", "if ram_gb < 20:\n", " print('Not using a high-RAM runtime')\n", "else:\n", " print('You are using a high-RAM runtime!')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 1: Setup required libraries and models. \n", "This may take a few minutes.\n", "\n", "You may optionally enable downloads with pydrive in order to authenticate and avoid drive download limits when fetching the pre-trained model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#@title Setup\n", "\n", "import os\n", "\n", "from pydrive.auth import GoogleAuth\n", "from pydrive.drive import GoogleDrive\n", "from google.colab import auth\n", "from oauth2client.client import GoogleCredentials\n", "\n", "from argparse import Namespace\n", "\n", "import sys\n", "import numpy as np\n", "\n", "from PIL import Image\n", "\n", "import torch\n", "import torchvision.transforms as transforms\n", "\n", "device = 'cuda'\n", "\n", "\n", "# install requirements\n", "!git clone https://github.com/zhang-zx/SINE.git sine_dir\n", "\n", "%cd sine_dir/\n", "!pip uninstall -y torchtext\n", "! pip install transformers==4.18.0 einops==0.4.1 omegaconf==2.1.1 torchmetrics==0.6.0 torch-fidelity==0.3.0 kornia==0.6 albumentations==1.1.0 opencv-python==4.2.0.34 imageio==2.14.1 setuptools==59.5.0 pillow==9.0.1 \n", "! pip install torch==1.10.2 torchvision==0.11.3\n", "! pip install pytorch-lightning==1.5.9\n", "! pip install git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers\n", "! pip install git+https://github.com/openai/CLIP.git@main#egg=clip\n", "! pip install -e .\n", "\n", "\n", "\n", "download_with_pydrive = True \n", " \n", "class Downloader(object):\n", " def __init__(self, use_pydrive):\n", " self.use_pydrive = use_pydrive\n", "\n", " if self.use_pydrive:\n", " self.authenticate()\n", " \n", " def authenticate(self):\n", " auth.authenticate_user()\n", " gauth = GoogleAuth()\n", " gauth.credentials = GoogleCredentials.get_application_default()\n", " self.drive = GoogleDrive(gauth)\n", " \n", " def download_file(self, file_id, file_dst):\n", " if self.use_pydrive:\n", " downloaded = self.drive.CreateFile({'id':file_id})\n", " downloaded.FetchMetadata(fetch_all=True)\n", " downloaded.GetContentFile(file_dst)\n", " else:\n", " !gdown --id $file_id -O $file_dst\n", "\n", "downloader = Downloader(download_with_pydrive)\n", "\n", "pre_trained_path = os.path.join('models', 'ldm', 'stable-diffusion-v4')\n", "os.makedirs(pre_trained_path, exist_ok=True)\n", "!wget -O models/ldm/stable-diffusion-v4/sd-v1-4-full-ema.ckpt https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4-full-ema.ckpt \n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step 2: Download the selected fine-tuned model. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "finetuned_models_dir = os.path.join('./models', 'finetuned')\n", "os.makedirs(finetuned_models_dir, exist_ok=True)\n", "\n", "orig_image_dir = './dataset'\n", "os.makedirs(orig_image_dir, exist_ok=True)\n", "\n", "source_model_type = 'dog w/o patch-based fine-tuning' #@param['dog w/o patch-based fine-tuning', 'dog w/ patch-based fine-tuning', 'Girl with a peral earring', 'Monalisa', 'castle w/o patch-based fine-tuning', 'castle w/ patch-based fine-tuning']\n", "source_model_download_path = {\"dog w/o patch-based fine-tuning\": \"1jHgkyxrwUXyMR2zBK9WAEWioEP3-F3fd\",\n", " \"dog w/ patch-based fine-tuning\": \"1YI7c29qBIy83OqJ4ykoAAul6P8uaXmls\",\n", " \"Girl with a peral earring\": \"1l6GCEfyURKQiCF77ZriYoZRtkOXQCyWD\",\n", " \"Monalisa\": \"194CDgHkomKrLvgFj89kamoTjUbMwyUIC\",\n", " \"castle w/o patch-based fine-tuning\": \"19I8ftab9vMQWnqPH2O7aHe-GnYolmVFF\",\n", " \"castle w/ patch-based fine-tuning\": \"1srzUr1fg6jTFKuf0M5oi5JgBsVhCt-nb\"}\n", "\n", "model_names = { \"dog w/o patch-based fine-tuning\": \"dog_wo_patch.ckpt\",\n", " \"dog w/ patch-based fine-tuning\": \"dog_w_patch.ckpt\",\n", " \"Girl with a peral earring\": \"girl.ckpt\",\n", " \"Monalisa\": \"monalisa.ckpt\",\n", " \"castle w/o patch-based fine-tuning\": \"castle_wo_patch\",\n", " \"castle w/ patch-based fine-tuning\": \"castle_w_patch\"}\n", "\n", "model_configs = { \"dog w/o patch-based fine-tuning\": \"./configs/stable-diffusion/v1-inference.yaml\",\n", " \"dog w/ patch-based fine-tuning\": \"./configs/stable-diffusion/v1-inference_patch.yaml\",\n", " \"Girl with a peral earring\": \"./configs/stable-diffusion/v1-inference_patch_nearest.yaml\",\n", " \"Monalisa\": \"./configs/stable-diffusion/v1-inference_patch_nearest.yaml\",\n", " \"castle w/o patch-based fine-tuning\": \"./configs/stable-diffusion/v1-inference.yaml\",\n", " \"castle w/ patch-based fine-tuning\": \"./configs/stable-diffusion/v1-inference_patch.yaml\"}\n", "\n", "orig_prompts = { \"dog w/o patch-based fine-tuning\": \"picture of a sks dog\",\n", " \"dog w/ patch-based fine-tuning\": \"picture of a sks dog\",\n", " \"Girl with a peral earring\": \"painting of a sks girl\",\n", " \"Monalisa\": \"painting of a sks lady\",\n", " \"castle w/o patch-based fine-tuning\": \"picture of a sks castle\",\n", " \"castle w/ patch-based fine-tuning\": \"picture of a sks castle\"}\n", "\n", "download_string = source_model_download_path[source_model_type]\n", "file_name = model_names[source_model_type]\n", "\n", "config_name = model_configs[source_model_type]\n", "fine_tune_prompt = orig_prompts[source_model_type]\n", "\n", "if not os.path.isfile(os.path.join(finetuned_models_dir, file_name)):\n", " downloader.download_file(download_string, os.path.join(finetuned_models_dir, file_name))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Step3: Edit the image with model-based guidance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import argparse, os, sys, glob\n", "import torch\n", "import numpy as np\n", "from omegaconf import OmegaConf\n", "from PIL import Image\n", "from tqdm import tqdm, trange\n", "from itertools import islice\n", "from einops import rearrange\n", "from torchvision.utils import make_grid, save_image\n", "import time\n", "from pytorch_lightning import seed_everything\n", "from torch import autocast\n", "from contextlib import contextmanager, nullcontext\n", "\n", "from ldm.util import instantiate_from_config\n", "from ldm.models.diffusion.ddim import DDIMSampler\n", "from ldm.models.diffusion.plms import PLMSSampler\n", "from IPython.display import display\n", "\n", "\n", "def chunk(it, size):\n", " it = iter(it)\n", " return iter(lambda: tuple(islice(it, size)), ())\n", "\n", "\n", "def load_model_from_config(config, ckpt, verbose=False):\n", " print(f\"Loading model from {ckpt}\")\n", " pl_sd = torch.load(ckpt, map_location=\"cpu\")\n", " if \"global_step\" in pl_sd:\n", " print(f\"Global Step: {pl_sd['global_step']}\")\n", " sd = pl_sd[\"state_dict\"]\n", " model = instantiate_from_config(config.model)\n", " m, u = model.load_state_dict(sd, strict=False)\n", " if len(m) > 0 and verbose:\n", " print(\"missing keys:\")\n", " print(m)\n", " if len(u) > 0 and verbose:\n", " print(\"unexpected keys:\")\n", " print(u)\n", "\n", " model.cuda()\n", " model.eval()\n", " return model\n", "\n", "seed = 42\n", "config = OmegaConf.load('configs/stable-diffusion/v1-inference.yaml')\n", "model = load_model_from_config(config, 'models/ldm/stable-diffusion-v4/sd-v1-4-full-ema.ckpt')\n", "\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", "model = model.to(device)\n", "\n", "sin_config = OmegaConf.load(f\"{config_name}\")\n", "sin_model = load_model_from_config(config, os.path.join(finetuned_models_dir, file_name))\n", "sin_model = sin_model.to(device)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "v = 0.7 #@param {type:\"slider\", min:0, max:1, step:0.05}\n", "K_min = 400 #@param {type:\"slider\", min:0, max:1000, step:10}\n", "scale = 7.5 #@param {type:\"slider\", min:1.0, max:50, step:0.5}\n", "ddim_steps = 100\n", "ddim_eta = 0.\n", "H = 512\n", "W = 512\n", "\n", "prompt = \"a dog wearing a superhero cape\" #@param {'type': 'string'}\n", "\n", "extra_config = {\n", " 'cond_beta': v,\n", " 'cond_beta_sin': 1. - v,\n", " 'range_t_max': 1000,\n", " 'range_t_min': K_min\n", "}\n", "\n", "\n", "from ldm.models.diffusion.guidance_ddim import DDIMSinSampler\n", "sampler = DDIMSinSampler(model, sin_model)\n", "\n", "setattr(sampler.model, 'extra_config', extra_config)\n", "\n", "\n", "batch_size = 1\n", "n_rows = 2\n", "start_code = None\n", "precision_scope = autocast\n", "num_samples = 4\n", "\n", "all_samples = list()\n", "\n", "with torch.no_grad():\n", " with precision_scope(\"cuda\"):\n", " with model.ema_scope():\n", " with sin_model.ema_scope():\n", " tic = time.time()\n", " all_samples = list()\n", " for n in trange(num_samples, desc=\"Sampling\"): \n", " uc = None\n", " if scale != 1.0:\n", " uc = model.get_learned_conditioning(batch_size * [\"\"])\n", " uc_sin = sin_model.get_learned_conditioning(batch_size * [\"\"])\n", "\n", " prompts = [prompt] * batch_size\n", " prompts_single = [fine_tune_prompt] * batch_size\n", " \n", " c = model.get_learned_conditioning(prompts)\n", " c_sin = sin_model.get_learned_conditioning(prompts_single)\n", " \n", " shape = [4, H // 8, W // 8]\n", " samples_ddim, _ = sampler.sample( S=ddim_steps,\n", " conditioning=c,\n", " conditioning_single=c_sin,\n", " batch_size=batch_size,\n", " shape=shape,\n", " verbose=False,\n", " unconditional_guidance_scale=scale,\n", " unconditional_conditioning=uc,\n", " unconditional_conditioning_single=uc_sin,\n", " eta=ddim_eta,\n", " x_T=start_code)\n", "\n", " x_samples_ddim = model.decode_first_stage(samples_ddim)\n", " x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)\n", "\n", " all_samples.append(x_samples_ddim)\n", "\n", " grid = torch.stack(all_samples, 0)\n", " grid = rearrange(grid, 'n b c h w -> (n b) c h w')\n", " \n", " grid = make_grid(grid, nrow=n_rows)\n", "\n", " # to image\n", " grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()\n", " os.makedirs('./output', exist_ok=True)\n", " Image.fromarray(grid.astype(np.uint8)).save(os.path.join('./output', f'{prompt.replace(\" \", \"-\")}.jpg'))\n", " display(Image.open(os.path.join('./output', f'{prompt.replace(\" \", \"-\")}.jpg')))\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.8.0 ('SINE')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "a84f578b9cda1db545aa6690161d7775d6ea32a647f25bb9ef4866c136688289" } } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: configs/stable-diffusion/v1-finetune_painting.yaml ================================================ model: base_learning_rate: 1.0e-06 target: ldm.models.diffusion.ddpm.LatentDiffusion params: reg_weight: 0.0 linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: image cond_stage_key: caption image_size: 64 channels: 4 cond_stage_trainable: true # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False embedding_reg_weight: 0.0 unfreeze_model: True model_lr: 1.0e-06 personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModel params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 512 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder data: target: main.DataModuleFromConfig params: batch_size: 1 num_workers: 2 wrap: false train: target: ldm.data.personalized_painting.SinImageDataset params: size: 512 set: train per_image_tokens: false repeats: 100 flip_p: 0.0 reg: target: ldm.data.personalized_painting.SinImageDataset params: size: 512 set: train per_image_tokens: false repeats: 100 flip_p: 0.0 validation: target: ldm.data.personalized_painting.SinImageDataset params: size: 512 set: val per_image_tokens: false repeats: 10 lightning: modelcheckpoint: params: every_n_train_steps: 500 callbacks: image_logger: target: main.ImageLogger params: batch_frequency: 500 max_images: 8 increase_log_steps: False trainer: benchmark: True max_steps: 800 ================================================ FILE: configs/stable-diffusion/v1-finetune_painting_style.yaml ================================================ model: base_learning_rate: 1.0e-06 target: ldm.models.diffusion.ddpm.LatentDiffusion params: reg_weight: 0.0 linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: image cond_stage_key: caption image_size: 64 channels: 4 cond_stage_trainable: true # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False embedding_reg_weight: 0.0 unfreeze_model: True model_lr: 1.0e-06 personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModel params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 512 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder data: target: main.DataModuleFromConfig params: batch_size: 1 num_workers: 2 wrap: false train: target: ldm.data.personalized_painting.SinImageDataset params: size: 512 set: train per_image_tokens: false repeats: 100 flip_p: 0.0 learn_style: true reg: target: ldm.data.personalized_painting.SinImageDataset params: size: 512 set: train per_image_tokens: false repeats: 100 flip_p: 0.0 validation: target: ldm.data.personalized_painting.SinImageDataset params: size: 512 set: val per_image_tokens: false repeats: 10 learn_style: true lightning: modelcheckpoint: params: every_n_train_steps: 500 callbacks: image_logger: target: main.ImageLogger params: batch_frequency: 500 max_images: 8 increase_log_steps: False trainer: benchmark: True max_steps: 800 ================================================ FILE: configs/stable-diffusion/v1-finetune_patch_painting.yaml ================================================ model: base_learning_rate: 1.0e-06 target: ldm.models.diffusion.ddpm.LatentDiffusion params: reg_weight: 0.0 linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: image cond_stage_key: caption image_size: 64 channels: 4 cond_stage_trainable: true # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False embedding_reg_weight: 0.0 unfreeze_model: True model_lr: 1.0e-6 scale_recon_loss: 1.0 personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModelPatch params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False padding_idx: 0 init_size: 128 div_half_dim: false center_shift: 100 interpolation_mode: "nearest" first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 512 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder data: target: main.DataModuleFromConfig params: batch_size: 1 num_workers: 2 wrap: false train: target: ldm.data.personalized_painting.SinImageHighResDataset params: size: 512 high_resolution: 1024 set: train per_image_tokens: false repeats: 100 min_crop_frac: 0.1 max_crop_frac: 1.0 rec_prob: 1. latent_scale: 8 reg: target: ldm.data.personalized_painting.SinImageHighResDataset params: size: 512 high_resolution: 1024 set: train per_image_tokens: false repeats: 1 min_crop_frac: 0.1 max_crop_frac: 1.0 rec_prob: 0. latent_scale: 8 validation: target: ldm.data.personalized_painting.SinImageHighResDataset params: size: 512 high_resolution: 1024 set: val per_image_tokens: false repeats: 10 min_crop_frac: 0.2 max_crop_frac: 1.0 rec_prob: 0.25 latent_scale: 8 lightning: modelcheckpoint: params: every_n_train_steps: 2000 callbacks: image_logger: target: main.ImageLogger params: batch_frequency: 2000 max_images: 8 increase_log_steps: False trainer: benchmark: True max_steps: 10000 ================================================ FILE: configs/stable-diffusion/v1-finetune_patch_picture.yaml ================================================ model: base_learning_rate: 1.0e-06 target: ldm.models.diffusion.ddpm.LatentDiffusion params: reg_weight: 0.0 linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: image cond_stage_key: caption image_size: 64 channels: 4 cond_stage_trainable: true # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False embedding_reg_weight: 0.0 unfreeze_model: True model_lr: 1.0e-6 scale_recon_loss: 1.0 personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModelPatch params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False padding_idx: 0 init_size: 128 div_half_dim: false center_shift: 100 interpolation_mode: "bilinear" first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 512 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder data: target: main.DataModuleFromConfig params: batch_size: 1 num_workers: 2 wrap: false train: target: ldm.data.personalized.SinImageHighResDataset params: size: 512 high_resolution: 1024 set: train per_image_tokens: false repeats: 100 min_crop_frac: 0.1 max_crop_frac: 1.0 rec_prob: 1. latent_scale: 8 reg: target: ldm.data.personalized.SinImageHighResDataset params: size: 512 high_resolution: 1024 set: train per_image_tokens: false repeats: 1 min_crop_frac: 0.1 max_crop_frac: 1.0 rec_prob: 0. latent_scale: 8 validation: target: ldm.data.personalized.SinImageHighResDataset params: size: 512 high_resolution: 1024 set: val per_image_tokens: false repeats: 10 min_crop_frac: 0.2 max_crop_frac: 1.0 rec_prob: 0.25 latent_scale: 8 lightning: modelcheckpoint: params: every_n_train_steps: 7000 callbacks: image_logger: target: main.ImageLogger params: batch_frequency: 2000 max_images: 8 increase_log_steps: False trainer: benchmark: True max_steps: 10000 ================================================ FILE: configs/stable-diffusion/v1-finetune_picture.yaml ================================================ model: base_learning_rate: 1.0e-06 target: ldm.models.diffusion.ddpm.LatentDiffusion params: reg_weight: 0.0 linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: image cond_stage_key: caption image_size: 64 channels: 4 cond_stage_trainable: true # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False embedding_reg_weight: 0.0 unfreeze_model: True model_lr: 1.0e-06 personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModel params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 512 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder data: target: main.DataModuleFromConfig params: batch_size: 1 num_workers: 2 wrap: false train: target: ldm.data.personalized.SinImageDataset params: size: 512 set: train per_image_tokens: false repeats: 100 flip_p: 0. reg: target: ldm.data.personalized.SinImageDataset params: size: 512 set: train per_image_tokens: false repeats: 100 flip_p: 0. validation: target: ldm.data.personalized.SinImageDataset params: size: 512 set: val per_image_tokens: false repeats: 10 lightning: modelcheckpoint: params: every_n_train_steps: 800 callbacks: image_logger: target: main.ImageLogger params: batch_frequency: 500 max_images: 8 increase_log_steps: False trainer: benchmark: True max_steps: 800 ================================================ FILE: configs/stable-diffusion/v1-inference.yaml ================================================ model: base_learning_rate: 1.0e-04 target: ldm.models.diffusion.ddpm.LatentDiffusion params: linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: "jpg" cond_stage_key: "txt" image_size: 64 channels: 4 cond_stage_trainable: false # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModel params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder ================================================ FILE: configs/stable-diffusion/v1-inference_patch.yaml ================================================ model: base_learning_rate: 1.0e-04 target: ldm.models.diffusion.ddpm.LatentDiffusion params: linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: "jpg" cond_stage_key: "txt" image_size: 64 channels: 4 cond_stage_trainable: false # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModelPatch params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False padding_idx: 0 init_size: 128 ## Note: Might be some problem in this line. Might need to be 1024 div_half_dim: false center_shift: 100 interpolation_mode: "bilinear" # bilinear or nearest supported first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder ================================================ FILE: configs/stable-diffusion/v1-inference_patch_nearest_interp.yaml ================================================ model: base_learning_rate: 1.0e-04 target: ldm.models.diffusion.ddpm.LatentDiffusion params: linear_start: 0.00085 linear_end: 0.0120 num_timesteps_cond: 1 log_every_t: 200 timesteps: 1000 first_stage_key: "jpg" cond_stage_key: "txt" image_size: 64 channels: 4 cond_stage_trainable: false # Note: different from the one we trained before conditioning_key: crossattn monitor: val/loss_simple_ema scale_factor: 0.18215 use_ema: False personalization_config: target: ldm.modules.embedding_manager.EmbeddingManager params: placeholder_strings: ["*"] initializer_words: ["sculpture"] per_image_tokens: false num_vectors_per_token: 1 progressive_words: False unet_config: target: ldm.modules.diffusionmodules.openaimodel.UNetModelPatch params: image_size: 32 # unused in_channels: 4 out_channels: 4 model_channels: 320 attention_resolutions: [ 4, 2, 1 ] num_res_blocks: 2 channel_mult: [ 1, 2, 4, 4 ] num_heads: 8 use_spatial_transformer: True transformer_depth: 1 context_dim: 768 use_checkpoint: True legacy: False padding_idx: 0 init_size: 128 ## Note: Might be some problem in this line. Might need to be 1024 div_half_dim: false center_shift: 100 interpolation_mode: "nearest" # bilinear or nearest supported first_stage_config: target: ldm.models.autoencoder.AutoencoderKL params: embed_dim: 4 monitor: val/rec_loss ddconfig: double_z: true z_channels: 4 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: - 1 - 2 - 4 - 4 num_res_blocks: 2 attn_resolutions: [] dropout: 0.0 lossconfig: target: torch.nn.Identity cond_stage_config: target: ldm.modules.encoders.modules.FrozenCLIPEmbedder ================================================ FILE: diffusers_models.py ================================================ from diffusers import UNet2DConditionModel from diffusers.models.unet_2d_condition import UNet2DConditionOutput from diffusers.configuration_utils import register_to_config from typing import Any, Dict, List, Optional, Tuple, Union from pydantic import StrictInt, StrictFloat, StrictBool, StrictStr import torch import torch.utils.checkpoint import torch.nn.functional as F from ldm.modules.diffusionmodules.positional_encoding import SinusoidalPositionalEmbedding class UNet2DConditionPatchModel(UNet2DConditionModel): @register_to_config def __init__( self, sample_size: Optional[int] = None, in_channels: int = 4, out_channels: int = 4, center_input_sample: bool = False, flip_sin_to_cos: bool = True, freq_shift: int = 0, down_block_types: Tuple[str] = ( "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D", ), mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", up_block_types: Tuple[str] = ( "UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), only_cross_attention: Union[bool, Tuple[bool]] = False, block_out_channels: Tuple[int] = (320, 640, 1280, 1280), layers_per_block: int = 2, downsample_padding: int = 1, mid_block_scale_factor: float = 1, act_fn: str = "silu", norm_num_groups: Optional[int] = 32, norm_eps: float = 1e-5, cross_attention_dim: int = 1280, attention_head_dim: Union[int, Tuple[int]] = 8, dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", time_embedding_type: str = "positional", # fourier, positional timestep_post_act: Optional[str] = None, time_cond_proj_dim: Optional[int] = None, conv_in_kernel: int = 3, conv_out_kernel: int = 3, projection_class_embeddings_input_dim: Optional[int] = None, padding_idx: StrictInt = 0, init_size: StrictInt = 128, div_half_dim: StrictBool = False, center_shift: StrictInt = 64, interpolation_mode: StrictStr = "bilinear", ): super().__init__(sample_size=sample_size, in_channels=in_channels, out_channels=out_channels, center_input_sample=center_input_sample, flip_sin_to_cos=flip_sin_to_cos, freq_shift=freq_shift, down_block_types=down_block_types, mid_block_type=mid_block_type, up_block_types=up_block_types, only_cross_attention=only_cross_attention, block_out_channels=block_out_channels, layers_per_block=layers_per_block, downsample_padding=downsample_padding, mid_block_scale_factor=mid_block_scale_factor, act_fn=act_fn, norm_num_groups=norm_num_groups, norm_eps=norm_eps, cross_attention_dim=cross_attention_dim, attention_head_dim=attention_head_dim, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, class_embed_type=class_embed_type, num_class_embeds=num_class_embeds, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, time_embedding_type=time_embedding_type, # fourier, positional timestep_post_act=timestep_post_act, time_cond_proj_dim=time_cond_proj_dim, conv_in_kernel=conv_in_kernel, conv_out_kernel=conv_out_kernel, projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, ) assert block_out_channels[0] % 2 == 0 self.head_position_encode = SinusoidalPositionalEmbedding(embedding_dim=block_out_channels[0]//2, padding_idx=padding_idx, init_size=init_size, div_half_dim=div_half_dim, center_shift=center_shift) self.init_size = init_size self.interpolation_mode = interpolation_mode def forward( self, sample: torch.FloatTensor, timestep: Union[torch.Tensor, float, int], encoder_hidden_states: torch.Tensor, crop_boxes: Optional[torch.Tensor] = None, class_labels: Optional[torch.Tensor] = None, timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). # However, the upsampling interpolation output size can be forced to fit any upsampling size # on the fly if necessary. default_overall_up_factor = 2**self.num_upsamplers # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` forward_upsample_size = False upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): forward_upsample_size = True # prepare attention_mask if attention_mask is not None: attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 attention_mask = attention_mask.unsqueeze(1) # 0. center input if necessary if self.config.center_input_sample: sample = 2 * sample - 1.0 # 1. time timesteps = timestep if not torch.is_tensor(timesteps): # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can # This would be a good case for the `match` statement (Python 3.10+) is_mps = sample.device.type == "mps" if isinstance(timestep, float): dtype = torch.float32 if is_mps else torch.float64 else: dtype = torch.int32 if is_mps else torch.int64 timesteps = torch.tensor( [timesteps], dtype=dtype, device=sample.device) elif len(timesteps.shape) == 0: timesteps = timesteps[None].to(sample.device) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timesteps = timesteps.expand(sample.shape[0]) t_emb = self.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors # but time_embedding might actually be running in fp16. so we need to cast here. # there might be better ways to encapsulate this. t_emb = t_emb.to(dtype=self.dtype) emb = self.time_embedding(t_emb, timestep_cond) if self.class_embedding is not None: if class_labels is None: raise ValueError( "class_labels should be provided when num_class_embeds > 0") if self.config.class_embed_type == "timestep": class_labels = self.time_proj(class_labels) class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) emb = emb + class_emb # 2. pre-process sample = self.conv_in(sample) head_grid = self.head_position_encode(torch.ones([sample.shape[0], sample.shape[1], self.init_size, self.init_size], dtype=self.dtype, device=sample.device)) if crop_boxes is not None: head_grid = torch.cat([F.interpolate(hg.unsqueeze(0)[:, :, box[0]: box[2], box[1]: box[3]], (sample.shape[2], sample.shape[3]), mode='bilinear', align_corners=True) for hg, box in zip(head_grid, crop_boxes)], dim=0) else: head_grid = F.interpolate( head_grid, (sample.shape[2], sample.shape[3]), mode='bilinear', align_corners=True) sample += head_grid # 3. down down_block_res_samples = (sample,) for downsample_block in self.down_blocks: if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, ) else: sample, res_samples = downsample_block( hidden_states=sample, temb=emb) down_block_res_samples += res_samples # 4. mid if self.mid_block is not None: sample = self.mid_block( sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, ) # 5. up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 res_samples = down_block_res_samples[-len(upsample_block.resnets):] down_block_res_samples = down_block_res_samples[: -len( upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, attention_mask=attention_mask, ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size ) # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) sample = self.conv_act(sample) sample = self.conv_out(sample) if not return_dict: return (sample,) return UNet2DConditionOutput(sample=sample) if __name__ == "__main__": unet = UNet2DConditionPatchModel.from_pretrained( "CompVis/stable-diffusion-v1-4", subfolder="unet", revision=None, low_cpu_mem_usage=False, device_map=None ) ================================================ FILE: diffusers_sample.py ================================================ from diffusers import StableDiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput import torch from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel from transformers import AutoTokenizer, PretrainedConfig from typing import Any, Callable, Dict, List, Optional, Union import importlib import os import diffusers from diffusers.models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT import argparse from accelerate.utils import ProjectConfiguration, set_seed def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", revision=revision, ) model_class = text_encoder_config.architectures[0] if model_class == "CLIPTextModel": from transformers import CLIPTextModel return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation return RobertaSeriesModelWithTransformation else: raise ValueError(f"{model_class} is not supported.") class StableDiffusionGuidancePipeline(StableDiffusionPipeline): text_encoder_orig = None unet_orig = None def __init__( self, vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker, ): super().__init__(vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker,) self.config['unet'] = (unet.__module__, unet.config['_class_name']) def add_pretrained_model(self, text_encoder, unet): self.text_encoder_orig = text_encoder.to(self._execution_device) self.unet_orig = unet.to(self._execution_device) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): torch_dtype = kwargs.pop("torch_dtype", None) provider = kwargs.pop("provider", None) sess_options = kwargs.pop("sess_options", None) device_map = kwargs.pop("device_map", None) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) return_cached_folder = kwargs.pop("return_cached_folder", False) # 1. Download the checkpoints and configs # use snapshot download here to get it working from from_pretrained cached_folder = pretrained_model_name_or_path config_dict = cls.load_config(cached_folder) if config_dict['unet'][0] is None: config_dict['unet'][0] = 'diffusers_models' # 2. Load the pipeline class pipeline_class = cls # some modules can be passed directly to the init # in this case they are already instantiated in `kwargs` # extract them here expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) # define init kwargs init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} init_kwargs = {**init_kwargs, **passed_pipe_kwargs} # remove `null` components def load_module(name, value): if isinstance(value, bool): return False if value[0] is None: return False if name in passed_class_obj and passed_class_obj[name] is None: return False return True init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)} # import it here to avoid circular import from diffusers import pipelines # 3. Load each module in the pipeline for name, (library_name, class_name) in init_dict.items(): # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names if class_name.startswith("Flax"): class_name = class_name[4:] is_pipeline_module = hasattr(pipelines, library_name) loaded_sub_model = None # if the model is in a pipeline module, then we load it from the pipeline if name in passed_class_obj: # 1. check that passed_class_obj has correct parent class # pass # set passed class object loaded_sub_model = passed_class_obj[name] elif is_pipeline_module: pipeline_module = getattr(pipelines, library_name) class_obj = getattr(pipeline_module, class_name) else: # else we just import it from the library. # NOTE: here I reuse library_name as the module name library = importlib.import_module(library_name) class_obj = getattr(library, class_name) if loaded_sub_model is None: load_method_name = 'from_pretrained' load_method = getattr(class_obj, load_method_name) loading_kwargs = {} if issubclass(class_obj, torch.nn.Module): loading_kwargs["torch_dtype"] = torch_dtype # if issubclass(class_obj, diffusers.OnnxRuntimeModel): # loading_kwargs["provider"] = provider # loading_kwargs["sess_options"] = sess_options is_diffusers_model = issubclass(class_obj, diffusers.ModelMixin) # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers. # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default. # This makes sure that the weights won't be initialized which significantly speeds up loading. if is_diffusers_model: loading_kwargs["device_map"] = device_map loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: # else load from the root directory loaded_sub_model = load_method(cached_folder, **loading_kwargs) init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) init_kwargs['requires_safety_checker'] = False # 4. Potentially add passed objects if expected missing_modules = set(expected_modules) - set(init_kwargs.keys()) passed_modules = list(passed_class_obj.keys()) optional_modules = pipeline_class._optional_components if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): for module in missing_modules: init_kwargs[module] = passed_class_obj.get(module, None) elif len(missing_modules) > 0: passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs raise ValueError( f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." ) # 5. Instantiate the pipeline model = pipeline_class(**init_kwargs) if return_cached_folder: return model, cached_folder return model def _encode_prompt_orig( self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, ): r""" Encodes the prompt into text encoder hidden states. Args: prompt (`str` or `List[str]`, *optional*): prompt to be encoded device: (`torch.device`): torch device num_images_per_prompt (`int`): number of images that should be generated per prompt do_classifier_free_guidance (`bool`): whether to use classifier free guidance or not negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input argument. negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. """ if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] if prompt_embeds is None: text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt", ) text_input_ids = text_inputs.input_ids untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( text_input_ids, untruncated_ids ): removed_text = self.tokenizer.batch_decode( untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] ) if hasattr(self.text_encoder_orig.config, "use_attention_mask") and self.text_encoder_orig.config.use_attention_mask: attention_mask = text_inputs.attention_mask.to(device) else: attention_mask = None prompt_embeds = self.text_encoder_orig( text_input_ids.to(device), attention_mask=attention_mask, ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_orig.dtype, device=device) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" f" {type(prompt)}." ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" " the batch size of `prompt`." ) else: uncond_tokens = negative_prompt max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt", ) if hasattr(self.text_encoder_orig.config, "use_attention_mask") and self.text_encoder_orig.config.use_attention_mask: attention_mask = uncond_input.attention_mask.to(device) else: attention_mask = None negative_prompt_embeds = self.text_encoder_orig( uncond_input.input_ids.to(device), attention_mask=attention_mask, ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_orig.dtype, device=device) negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @torch.no_grad() def __call__( self, prompt: Union[str, List[str]] = None, prompt_edit: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, model_based_guidance_scale: float = 0.0, K_min: int = 400, ): # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) self.check_inputs( prompt_edit, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 assert isinstance(prompt_edit, str) elif prompt is not None and isinstance(prompt, list): batch_size = len(prompt) assert isinstance(prompt_edit, list) assert len(prompt_edit) == len(prompt) else: batch_size = prompt_embeds.shape[0] device = self._execution_device # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt prompt_embeds = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, ) prompt_embdes_edit = self._encode_prompt_orig( prompt_edit, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, prompt_embeds=None, negative_prompt_embeds=None, ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps # 5. Prepare latent variables num_channels_latents = self.unet.in_channels latents = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, height, width, prompt_embeds.dtype, device, generator, latents, ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet_orig( latent_model_input, t, encoder_hidden_states=prompt_embdes_edit, cross_attention_kwargs=cross_attention_kwargs, ).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) if t > K_min and model_based_guidance_scale > 0.0: noise_pred_guidance = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, ).sample if do_classifier_free_guidance: noise_pred_guidance_uncond, noise_pred_guidance_text = noise_pred_guidance.chunk(2) noise_pred_text = noise_pred_text * (1 - model_based_guidance_scale) + noise_pred_guidance_text * model_based_guidance_scale else: noise_pred = noise_pred * (1 - model_based_guidance_scale) + noise_pred_guidance * model_based_guidance_scale # perform guidance if do_classifier_free_guidance: noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if output_type == "latent": image = latents elif output_type == "pil": # 8. Post-processing image = self.decode_latents(latents) # 10. Convert to PIL image = self.numpy_to_pil(image) else: # 8. Post-processing image = self.decode_latents(latents) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() if not return_dict: return (image, None) return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None) def parse_args(input_args=None): parser = argparse.ArgumentParser( description="Simple example of a training script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, default=None, required=True, help="Path to pretrained model or model identifier from huggingface.co/models.", ) parser.add_argument( '--prompt', type=str, default=None, required=True, help='Text prompt for the fine-tuned model.' ) parser.add_argument( '--editing_prompt', type=str, default=None, required=True, help='Text prompt for the pre-trained model.' ) parser.add_argument("--seed", type=int, default=412441, help="A seed for reproducible sampling.") parser.add_argument('--num_images_per_prompt', type=int, default=2, help='Batch size.') parser.add_argument('--num_iterations', type=int, default=1,) parser.add_argument('--model_based_guidance_scale', type=float, default=0.3, help='Scale of model-based guidance.') parser.add_argument('--guidance_scale', type=float, default=7.5, help='Scale of classifier-free guidance.') parser.add_argument('--K', default=400, type=int, help='step to stop guidance') parser.add_argument('--ddim_steps', default=100, type=int, help='Number of ddim steps') parser.add_argument('--height', default=512, type=int, help='Height of the image') parser.add_argument('--width', default=512, type=int, help='Width of the image') if input_args is not None: args = parser.parse_args(input_args) else: args = parser.parse_args() return args if __name__ == "__main__": args = parse_args() set_seed(args.seed) pretrained_model_name_or_path = "CompVis/stable-diffusion-v1-4" text_encoder_cls = import_model_class_from_model_name_or_path(pretrained_model_name_or_path, None) text_encoder = text_encoder_cls.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", revision=None ) unet = UNet2DConditionModel.from_pretrained( pretrained_model_name_or_path, subfolder="unet", revision=None ) model_id = args.pretrained_model_name_or_path pipe = StableDiffusionGuidancePipeline.from_pretrained(model_id, torch_dtype=torch.float).to("cuda") pipe.add_pretrained_model(text_encoder=text_encoder, unet=unet) prompt = args.prompt prompt_edit = args.editing_prompt file_name = (prompt + '[SEP]' + prompt_edit).replace(' ', '_') for i in range(args.num_iterations): images = pipe(prompt=prompt, prompt_edit=prompt_edit, model_based_guidance_scale=args.model_based_guidance_scale, K_min=args.K, num_inference_steps=args.ddim_steps, guidance_scale=args.guidance_scale, num_images_per_prompt=args.num_images_per_prompt, height=args.height, width=args.width).images for j, image in enumerate(images): image.save("{}/{}_{}.png".format(args.pretrained_model_name_or_path, file_name, i * args.num_images_per_prompt + j)) ================================================ FILE: diffusers_train.py ================================================ #!/usr/bin/env python # coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and import argparse import hashlib import itertools import logging import math import os import warnings from pathlib import Path from typing import Optional import random from copy import deepcopy import accelerate import torch import torch.nn.functional as F import torch.utils.checkpoint import transformers from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, create_repo, whoami from packaging import version from PIL import Image from torch.utils.data import Dataset from torchvision import transforms from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig import numpy as np import diffusers from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version from diffusers.utils.import_utils import is_xformers_available from diffusers_models import UNet2DConditionPatchModel logger = get_logger(__name__) def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, subfolder="text_encoder", revision=revision, ) model_class = text_encoder_config.architectures[0] if model_class == "CLIPTextModel": from transformers import CLIPTextModel return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation return RobertaSeriesModelWithTransformation else: raise ValueError(f"{model_class} is not supported.") def parse_args(input_args=None): parser = argparse.ArgumentParser( description="Simple example of a training script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, default=None, required=True, help="Path to pretrained model or model identifier from huggingface.co/models.", ) parser.add_argument( "--revision", type=str, default=None, required=False, help=( "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be" " float32 precision." ), ) parser.add_argument( "--tokenizer_name", type=str, default=None, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--img_path", type=str, default=None, required=True, ) parser.add_argument( "--instance_prompt", type=str, default=None, required=True, help="The prompt with identifier specifying the instance", ) parser.add_argument( "--output_dir", type=str, default="text-inversion-model", help="The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument("--seed", type=int, default=412441, help="A seed for reproducible training.") parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" " resolution" ), ) parser.add_argument( "--train_text_encoder", action="store_true", help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", ) parser.add_argument( "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." ) parser.add_argument( "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." ) parser.add_argument("--num_train_epochs", type=int, default=1) parser.add_argument( "--max_train_steps", type=int, default=None, help="Total number of training steps to perform. If provided, overrides num_train_epochs.", ) parser.add_argument( "--checkpointing_steps", type=int, default=500, help=( "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. " "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" "instructions." ), ) parser.add_argument( "--checkpoints_total_limit", type=int, default=None, help=( "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" " for more details" ), ) parser.add_argument( "--resume_from_checkpoint", type=str, default=None, help=( "Whether training should be resumed from a previous checkpoint. Use a path saved by" ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' ), ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--gradient_checkpointing", action="store_true", help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", ) parser.add_argument( "--learning_rate", type=float, default=5e-6, help="Initial learning rate (after the potential warmup period) to use.", ) parser.add_argument( "--scale_lr", action="store_true", default=False, help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", ) parser.add_argument( "--lr_scheduler", type=str, default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' ' "constant", "constant_with_warmup"]' ), ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." ) parser.add_argument( "--lr_num_cycles", type=int, default=1, help="Number of hard resets of the lr in cosine_with_restarts scheduler.", ) parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") parser.add_argument( "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." ), ) parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") parser.add_argument( "--hub_model_id", type=str, default=None, help="The name of the repository to keep in sync with the local `output_dir`.", ) parser.add_argument( "--logging_dir", type=str, default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." ), ) parser.add_argument( "--allow_tf32", action="store_true", help=( "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" ), ) parser.add_argument( "--report_to", type=str, default="tensorboard", help=( 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' ), ) parser.add_argument( "--mixed_precision", type=str, default=None, choices=["no", "fp16", "bf16"], help=( "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." ), ) parser.add_argument( "--prior_generation_precision", type=str, default=None, choices=["no", "fp32", "fp16", "bf16"], help=( "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." ), ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) parser.add_argument( "--set_grads_to_none", action="store_true", help=( "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" " behaviors, so disable this argument if it causes any problems. More info:" " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" ), ) parser.add_argument( "--patch_based_training", action="store_true", help=( "activate patch based training" ), ) parser.add_argument( '--high_res', type=int, default=1024, help='the highest resolution provided to the dataloader' ) parser.add_argument( '--latent_scale', type=int, default=8, help='the scale of the latent space' ) parser.add_argument( '--min_crop_frac', type=float, default=0.1, help='the minimum fraction of the image to crop' ) parser.add_argument( '--max_crop_frac', type=float, default=1, help='the maximum fraction of the image to crop' ) parser.add_argument( '--rec_prob', type=float, default=0.1, help='the probability of using the whole image as the crop' ) if input_args is not None: args = parser.parse_args(input_args) else: args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank return args class SINEDatasetPatch(Dataset): def __init__( self, img_path, instance_prompt, tokenizer, size=512, high_res=1024, min_crop_frac=0.1, max_crop_frac=1, rec_prob=0.1, latent_scale=8, ): super().__init__() self.size = size self.tokenizer = tokenizer self.img_path = Path(img_path) if not self.img_path.exists(): raise ValueError(f"Image {self.img_path} doesn't exists.") self.num_instance_images = 1 self.instance_prompt = instance_prompt self._length = self.num_instance_images self.image = Image.open(img_path) if not self.image.mode == "RGB": self.image = self.image.convert("RGB") self.image = self.image.resize((high_res, high_res), resample=Image.Resampling.BICUBIC) self.instance_prompt_ids = self.tokenizer( self.instance_prompt, truncation=True, padding="max_length", max_length=self.tokenizer.model_max_length, return_tensors="pt", ).input_ids self.high_res = high_res self.min_crop_frac = min_crop_frac self.max_crop_frac = max_crop_frac self.rec_prob = rec_prob self.latent_scale = latent_scale self.image_transform = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize([0.5], [0.5]), ] ) def __len__(self): return self._length * 20 def _random_crop(self, pil_image): patch_size_y = int( (self.high_res//self.latent_scale) * (random.random() * (self.max_crop_frac - self.min_crop_frac) + self.min_crop_frac)) patch_size_x = int( (self.high_res//self.latent_scale) * (random.random() * (self.max_crop_frac - self.min_crop_frac) + self.min_crop_frac)) crop_y = random.randrange((self.high_res//self.latent_scale) - patch_size_y + 1) crop_x = random.randrange((self.high_res//self.latent_scale) - patch_size_x + 1) return pil_image.crop((crop_x * self.latent_scale, crop_y * self.latent_scale, (crop_x + patch_size_x) * self.latent_scale, (crop_y + patch_size_y) * self.latent_scale)).resize( (self.size, self.size), resample=Image.Resampling.BICUBIC), crop_y, crop_x, crop_y + patch_size_y, crop_x + patch_size_x def __getitem__(self, i): example = {} image = deepcopy(self.image) if random.random() > self.rec_prob: image, crop_y, crop_x, crop_y1, crop_x1 = self._random_crop(image) crop_area = torch.tensor([crop_y, crop_x, crop_y1, crop_x1]) else: image = image.resize((self.size, self.size), resample=Image.Resampling.BICUBIC ) crop_area = torch.tensor([0, 0, self.high_res//self.latent_scale, self.high_res//self.latent_scale]) example = { "instance_images": self.image_transform(image), "instance_prompt_ids": self.instance_prompt_ids, "crop_area": crop_area, } return example class SINEDatasetSingleRes(Dataset): def __init__( self, img_path, instance_prompt, tokenizer, size=512, ): super().__init__() self.size = size self.tokenizer = tokenizer self.img_path = Path(img_path) if not self.img_path.exists(): raise ValueError(f"Image {self.img_path} doesn't exists.") self.num_instance_images = 1 self.instance_prompt = instance_prompt self._length = self.num_instance_images self.image = Image.open(img_path) if not self.image.mode == "RGB": self.image = self.image.convert("RGB") self.image_transform = transforms.Compose( [ transforms.Resize( (size, size), interpolation=transforms.InterpolationMode.BICUBIC), transforms.ToTensor(), transforms.Normalize([0.5], [0.5]), ] ) self.image = self.image_transform(self.image) self.instance_prompt_ids = self.tokenizer( self.instance_prompt, truncation=True, padding="max_length", max_length=self.tokenizer.model_max_length, return_tensors="pt", ).input_ids def __len__(self): return self._length * 20 def __getitem__(self, index): example = { "instance_images": self.image, "instance_prompt_ids": self.instance_prompt_ids, } return example def collate_fn(examples): input_ids = [example["instance_prompt_ids"] for example in examples] pixel_values = [example["instance_images"] for example in examples] # Concat class and instance examples for prior preservation. # We do this to avoid doing two forward passes. pixel_values = torch.stack(pixel_values) pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() input_ids = torch.cat(input_ids, dim=0) if "crop_area" in examples[0]: crop_area = [example["crop_area"] for example in examples] crop_area = torch.stack(crop_area) # crop_area = crop_area.to(memory_format=torch.contiguous_format).float() batch = { "input_ids": input_ids, "pixel_values": pixel_values, "crop_area": crop_area, } return batch batch = { "input_ids": input_ids, "pixel_values": pixel_values, } return batch def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: username = whoami(token)["name"] return f"{username}/{model_id}" else: return f"{organization}/{model_id}" def main(args): logging_dir = Path(args.output_dir, args.logging_dir) accelerator_project_config = ProjectConfiguration( total_limit=args.checkpoints_total_limit) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, project_config=accelerator_project_config, ) # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: raise ValueError( "Gradient accumulation is not supported when training the text encoder in distributed training. " "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." ) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: transformers.utils.logging.set_verbosity_warning() diffusers.utils.logging.set_verbosity_info() else: transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name( Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) repo = Repository( args.output_dir, clone_from=repo_name, token=args.hub_token) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) # Load the tokenizer if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, revision=args.revision, use_fast=False) elif args.pretrained_model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False, ) # import correct text encoder class text_encoder_cls = import_model_class_from_model_name_or_path( args.pretrained_model_name_or_path, args.revision) # Load scheduler and models noise_scheduler = DDPMScheduler.from_pretrained( args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision ) vae = AutoencoderKL.from_pretrained( args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) if args.patch_based_training: unet = UNet2DConditionPatchModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, low_cpu_mem_usage=False, device_map=None ) else: unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision ) # `accelerate` 0.16.0 will have better support for customized saving if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): for model in models: sub_dir = "unet" if type(model) == type( unet) else "text_encoder" model.save_pretrained(os.path.join(output_dir, sub_dir)) # make sure to pop weight so that corresponding model is not saved again weights.pop() def load_model_hook(models, input_dir): while len(models) > 0: # pop models so that they are not loaded again model = models.pop() if type(model) == type(text_encoder): # load transformers style into model load_model = text_encoder_cls.from_pretrained( input_dir, subfolder="text_encoder") model.config = load_model.config else: # load diffusers style into model load_model = UNet2DConditionModel.from_pretrained( input_dir, subfolder="unet") model.register_to_config(**load_model.config) model.load_state_dict(load_model.state_dict()) del load_model accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) vae.requires_grad_(False) if not args.train_text_encoder: text_encoder.requires_grad_(False) if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): import xformers xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warn( "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() else: raise ValueError( "xformers is not available. Make sure it is installed correctly") if args.gradient_checkpointing: unet.enable_gradient_checkpointing() if args.train_text_encoder: text_encoder.gradient_checkpointing_enable() # Check that all trainable models are in full precision low_precision_error_string = ( "Please make sure to always have all model weights in full float32 precision when starting training - even if" " doing mixed precision training. copy of the weights should still be float32." ) if accelerator.unwrap_model(unet).dtype != torch.float32: raise ValueError( f"Unet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}" ) if args.train_text_encoder and accelerator.unwrap_model(text_encoder).dtype != torch.float32: raise ValueError( f"Text encoder loaded as datatype {accelerator.unwrap_model(text_encoder).dtype}." f" {low_precision_error_string}" ) # Enable TF32 for faster training on Ampere GPUs, # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True if args.scale_lr: args.learning_rate = ( args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs if args.use_8bit_adam: try: import bitsandbytes as bnb except ImportError: raise ImportError( "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." ) optimizer_class = bnb.optim.AdamW8bit else: optimizer_class = torch.optim.AdamW # Optimizer creation params_to_optimize = ( itertools.chain(unet.parameters(), text_encoder.parameters( )) if args.train_text_encoder else unet.parameters() ) optimizer = optimizer_class( params_to_optimize, lr=args.learning_rate, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, eps=args.adam_epsilon, ) # Dataset and DataLoaders creation: train_dataset = SINEDatasetSingleRes( img_path=args.img_path, instance_prompt=args.instance_prompt, tokenizer=tokenizer, size=args.resolution, ) if not args.patch_based_training else SINEDatasetPatch( img_path=args.img_path, instance_prompt=args.instance_prompt, tokenizer=tokenizer, size=args.resolution, high_res=args.high_res, min_crop_frac=args.min_crop_frac, max_crop_frac=args.max_crop_frac, rec_prob=args.rec_prob, latent_scale=args.latent_scale, ) train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, shuffle=True, collate_fn=lambda examples: collate_fn(examples), num_workers=args.dataloader_num_workers, ) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( args.lr_scheduler, optimizer=optimizer, num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, power=args.lr_power, ) # Prepare everything with our `accelerator`. if args.train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler ) else: unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, optimizer, train_dataloader, lr_scheduler ) # For mixed precision training we cast the text_encoder and vae weights to half-precision # as these models are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 elif accelerator.mixed_precision == "bf16": weight_dtype = torch.bfloat16 # Move vae and text_encoder to device and cast to weight_dtype vae.to(accelerator.device, dtype=weight_dtype) if not args.train_text_encoder: text_encoder.to(accelerator.device, dtype=weight_dtype) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil( args.max_train_steps / num_update_steps_per_epoch) # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. if accelerator.is_main_process: accelerator.init_trackers("dreambooth", config=vars(args)) # Train! total_batch_size = args.train_batch_size * \ accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.train_batch_size}") logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") global_step = 0 first_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint != "latest": path = os.path.basename(args.resume_from_checkpoint) else: # Get the mos recent checkpoint dirs = os.listdir(args.output_dir) dirs = [d for d in dirs if d.startswith("checkpoint")] dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) path = dirs[-1] if len(dirs) > 0 else None if path is None: accelerator.print( f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." ) args.resume_from_checkpoint = None else: accelerator.print(f"Resuming from checkpoint {path}") accelerator.load_state(os.path.join(args.output_dir, path)) global_step = int(path.split("-")[1]) resume_global_step = global_step * args.gradient_accumulation_steps first_epoch = global_step // num_update_steps_per_epoch resume_step = resume_global_step % ( num_update_steps_per_epoch * args.gradient_accumulation_steps) # Only show the progress bar once on each machine. progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) progress_bar.set_description("Steps") for epoch in range(first_epoch, args.num_train_epochs): unet.train() if args.train_text_encoder: text_encoder.train() for step, batch in enumerate(train_dataloader): # Skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: if step % args.gradient_accumulation_steps == 0: progress_bar.update(1) continue with accelerator.accumulate(unet): # Convert images to latent space latents = vae.encode(batch["pixel_values"].to( dtype=weight_dtype)).latent_dist.sample() latents = latents * vae.config.scaling_factor # Sample noise that we'll add to the latents noise = torch.randn_like(latents) bsz = latents.shape[0] # Sample a random timestep for each image timesteps = torch.randint( 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) timesteps = timesteps.long() # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise( latents, noise, timesteps) # Get the text embedding for conditioning encoder_hidden_states = text_encoder(batch["input_ids"])[0] # Predict the noise residual model_pred = unet(noisy_latents, timesteps,encoder_hidden_states).sample if not args.patch_based_training else \ unet(noisy_latents, timesteps,encoder_hidden_states, crop_boxes=batch['crop_area']).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": target = noise_scheduler.get_velocity( latents, noise, timesteps) else: raise ValueError( f"Unknown prediction type {noise_scheduler.config.prediction_type}") loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") accelerator.backward(loss) if accelerator.sync_gradients: params_to_clip = ( itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() ) accelerator.clip_grad_norm_( params_to_clip, args.max_grad_norm) optimizer.step() lr_scheduler.step() optimizer.zero_grad(set_to_none=args.set_grads_to_none) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) global_step += 1 if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: save_path = os.path.join( args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") logs = {"loss": loss.detach().item( ), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break # Create the pipeline using using the trained modules and save it. accelerator.wait_for_everyone() if accelerator.is_main_process: pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=accelerator.unwrap_model(unet), text_encoder=accelerator.unwrap_model(text_encoder), revision=args.revision, ) pipeline.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) accelerator.end_training() if __name__ == "__main__": args = parse_args() main(args) ================================================ FILE: environment.yml ================================================ name: SINE channels: - defaults dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=5.1=1_gnu - ca-certificates=2022.10.11=h06a4308_0 - certifi=2022.9.24=py38h06a4308_0 - libedit=3.1.20210910=h7f8727e_0 - libffi=3.2.1=hf484d3e_1007 - libgcc-ng=11.2.0=h1234567_1 - libgomp=11.2.0=h1234567_1 - libstdcxx-ng=11.2.0=h1234567_1 - ncurses=6.3=h5eee18b_3 - openssl=1.1.1s=h7f8727e_0 - pip=22.2.2=py38h06a4308_0 - python=3.8.0=h0371630_2 - readline=7.0=h7b6447c_5 - sqlite=3.33.0=h62c20be_0 - tk=8.6.12=h1ccaba5_0 - wheel=0.37.1=pyhd3eb1b0_0 - xz=5.2.8=h5eee18b_0 - zlib=1.2.13=h5eee18b_0 - pip: - absl-py==1.3.0 - accelerate==0.16.0 - aiohttp==3.8.3 - aiosignal==1.3.1 - albumentations==1.1.0 - antlr4-python3-runtime==4.8 - asttokens==2.2.1 - async-timeout==4.0.2 - attrs==22.1.0 - autopep8==2.0.0 - backcall==0.2.0 - cachetools==5.2.0 - charset-normalizer==2.1.1 - debugpy==1.6.4 - decorator==5.1.1 - einops==0.6.0 - entrypoints==0.4 - executing==1.2.0 - filelock==3.8.2 - frozenlist==1.3.3 - fsspec==2022.11.0 - ftfy==6.1.1 - future==0.18.2 - google-auth==2.15.0 - google-auth-oauthlib==0.4.6 - grpcio==1.51.1 - huggingface-hub==0.11.1 - idna==3.4 - imageio==2.14.1 - imageio-ffmpeg==0.4.7 - importlib-metadata==5.1.0 - ipdb==0.13.9 - ipykernel==6.17.1 - ipython==8.7.0 - jedi==0.18.2 - jinja2==3.1.2 - joblib==1.2.0 - jupyter-client==7.4.8 - jupyter-core==5.1.0 - kornia==0.6.0 - markdown==3.4.1 - markupsafe==2.1.1 - matplotlib-inline==0.1.6 - multidict==6.0.3 - nest-asyncio==1.5.6 - networkx==2.8.8 - numpy==1.23.5 - oauthlib==3.2.2 - omegaconf==2.1.1 - opencv-python==4.2.0.34 - opencv-python-headless==4.6.0.66 - packaging==21.3 - pandas==1.5.2 - parso==0.8.3 - pexpect==4.8.0 - pickleshare==0.7.5 - pillow==9.3.0 - platformdirs==2.6.0 - prompt-toolkit==3.0.36 - protobuf==3.20.3 - psutil==5.9.4 - ptyprocess==0.7.0 - pudb==2019.2 - pure-eval==0.2.2 - pyasn1==0.4.8 - pyasn1-modules==0.2.8 - pycodestyle==2.10.0 - pydantic==1.10.2 - pydeprecate==0.3.1 - pygments==2.13.0 - pyparsing==3.0.9 - python-dateutil==2.8.2 - pytorch-lightning==1.5.9 - pytz==2022.6 - pywavelets==1.4.1 - pyyaml==6.0 - pyzmq==24.0.1 - qudida==0.0.4 - regex==2022.10.31 - requests==2.28.1 - requests-oauthlib==1.3.1 - rsa==4.9 - scikit-image==0.19.3 - scikit-learn==1.1.3 - scipy==1.9.3 - setuptools==59.5.0 - six==1.16.0 - stack-data==0.6.2 - tensorboard==2.11.0 - tensorboard-data-server==0.6.1 - tensorboard-plugin-wit==1.8.1 - test-tube==0.7.5 - threadpoolctl==3.1.0 - tifffile==2022.10.10 - tokenizers==0.13.2 - toml==0.10.2 - tomli==2.0.1 - torch==1.11.0+cu113 - torchmetrics==0.11.0 - torchvision==0.12.0+cu113 - tornado==6.2 - tqdm==4.64.1 - traitlets==5.6.0 - transformers==4.25.1 - typing-extensions==4.5.0 - urllib3==1.26.13 - urwid==2.1.2 - wcwidth==0.2.5 - werkzeug==2.2.2 - yarl==1.8.2 - zipp==3.11.0 ================================================ FILE: ldm/data/__init__.py ================================================ ================================================ FILE: ldm/data/base.py ================================================ from abc import abstractmethod from torch.utils.data import Dataset, ConcatDataset, ChainDataset, IterableDataset class Txt2ImgIterableBaseDataset(IterableDataset): ''' Define an interface to make the IterableDatasets for text2img data chainable ''' def __init__(self, num_records=0, valid_ids=None, size=256): super().__init__() self.num_records = num_records self.valid_ids = valid_ids self.sample_ids = valid_ids self.size = size print(f'{self.__class__.__name__} dataset contains {self.__len__()} examples.') def __len__(self): return self.num_records @abstractmethod def __iter__(self): pass ================================================ FILE: ldm/data/personalized.py ================================================ import os import numpy as np import re import PIL from PIL import Image from torch.utils.data import Dataset from torchvision import transforms from copy import deepcopy import random import torch training_templates_smallest = [ 'photo of a sks {}', ] reg_templates_smallest = [ 'photo of a {}', ] imagenet_templates_small = [ 'a photo of a {}', 'a rendering of a {}', 'a cropped photo of the {}', 'the photo of a {}', 'a photo of a clean {}', 'a photo of a dirty {}', 'a dark photo of the {}', 'a photo of my {}', 'a photo of the cool {}', 'a close-up photo of a {}', 'a bright photo of the {}', 'a cropped photo of a {}', 'a photo of the {}', 'a good photo of the {}', 'a photo of one {}', 'a close-up photo of the {}', 'a rendition of the {}', 'a photo of the clean {}', 'a rendition of a {}', 'a photo of a nice {}', 'a good photo of a {}', 'a photo of the nice {}', 'a photo of the small {}', 'a photo of the weird {}', 'a photo of the large {}', 'a photo of a cool {}', 'a photo of a small {}', 'an illustration of a {}', 'a rendering of a {}', 'a cropped photo of the {}', 'the photo of a {}', 'an illustration of a clean {}', 'an illustration of a dirty {}', 'a dark photo of the {}', 'an illustration of my {}', 'an illustration of the cool {}', 'a close-up photo of a {}', 'a bright photo of the {}', 'a cropped photo of a {}', 'an illustration of the {}', 'a good photo of the {}', 'an illustration of one {}', 'a close-up photo of the {}', 'a rendition of the {}', 'an illustration of the clean {}', 'a rendition of a {}', 'an illustration of a nice {}', 'a good photo of a {}', 'an illustration of the nice {}', 'an illustration of the small {}', 'an illustration of the weird {}', 'an illustration of the large {}', 'an illustration of a cool {}', 'an illustration of a small {}', 'a depiction of a {}', 'a rendering of a {}', 'a cropped photo of the {}', 'the photo of a {}', 'a depiction of a clean {}', 'a depiction of a dirty {}', 'a dark photo of the {}', 'a depiction of my {}', 'a depiction of the cool {}', 'a close-up photo of a {}', 'a bright photo of the {}', 'a cropped photo of a {}', 'a depiction of the {}', 'a good photo of the {}', 'a depiction of one {}', 'a close-up photo of the {}', 'a rendition of the {}', 'a depiction of the clean {}', 'a rendition of a {}', 'a depiction of a nice {}', 'a good photo of a {}', 'a depiction of the nice {}', 'a depiction of the small {}', 'a depiction of the weird {}', 'a depiction of the large {}', 'a depiction of a cool {}', 'a depiction of a small {}', ] imagenet_dual_templates_small = [ 'a photo of a {} with {}', 'a rendering of a {} with {}', 'a cropped photo of the {} with {}', 'the photo of a {} with {}', 'a photo of a clean {} with {}', 'a photo of a dirty {} with {}', 'a dark photo of the {} with {}', 'a photo of my {} with {}', 'a photo of the cool {} with {}', 'a close-up photo of a {} with {}', 'a bright photo of the {} with {}', 'a cropped photo of a {} with {}', 'a photo of the {} with {}', 'a good photo of the {} with {}', 'a photo of one {} with {}', 'a close-up photo of the {} with {}', 'a rendition of the {} with {}', 'a photo of the clean {} with {}', 'a rendition of a {} with {}', 'a photo of a nice {} with {}', 'a good photo of a {} with {}', 'a photo of the nice {} with {}', 'a photo of the small {} with {}', 'a photo of the weird {} with {}', 'a photo of the large {} with {}', 'a photo of a cool {} with {}', 'a photo of a small {} with {}', ] reg_templates_smallest = [ 'photo of a {}', ] reg_templates_no_class_smallest = [ 'a photo', ] reg_templates_no_class_small = [ 'a photo', 'a rendering', 'a cropped photo', 'the photo', 'a dark photo', 'a close-up photo', 'a bright photo', 'a cropped photo', 'a good photo', 'a rendition', 'an illustration', 'a depiction', ] per_img_token_list = [ 'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת', ] class PersonalizedBase(Dataset): def __init__(self, data_root, size=None, repeats=100, interpolation="bicubic", flip_p=0.5, set="train", placeholder_token="dog", per_image_tokens=False, center_crop=False, mixing_prob=0.25, coarse_class_text=None, reg = False ): self.data_root = data_root self.image_paths = [os.path.join(self.data_root, file_path) for file_path in os.listdir(self.data_root)] # self._length = len(self.image_paths) self.num_images = len(self.image_paths) self._length = self.num_images self.placeholder_token = placeholder_token self.per_image_tokens = per_image_tokens self.center_crop = center_crop self.mixing_prob = mixing_prob self.coarse_class_text = coarse_class_text if per_image_tokens: assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." if set == "train": self._length = self.num_images * repeats self.size = size self.interpolation = {"linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) self.reg = reg def __len__(self): return self._length def __getitem__(self, i): example = {} image = Image.open(self.image_paths[i % self.num_images]) if not image.mode == "RGB": image = image.convert("RGB") placeholder_string = self.placeholder_token if self.coarse_class_text: placeholder_string = f"{self.coarse_class_text} {placeholder_string}" if not self.reg: text = random.choice(training_templates_smallest).format(placeholder_string) else: text = random.choice(reg_templates_smallest).format(placeholder_string) example["caption"] = text # default to score-sde preprocessing img = np.array(image).astype(np.uint8) if self.center_crop: crop = min(img.shape[0], img.shape[1]) h, w, = img.shape[0], img.shape[1] img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(w + crop) // 2] image = Image.fromarray(img) if self.size is not None: image = image.resize((self.size, self.size), resample=self.interpolation) image = self.flip(image) image = np.array(image).astype(np.uint8) example["image"] = (image / 127.5 - 1.0).astype(np.float32) return example def crop_image(img, size=512, cropping='random', crop_scale=[1, 1]): if cropping in ['random', 'random_long_edge']: h, w, = img.shape[0], img.shape[1] crop = min(h, w) crop = int(torch.empty(1).uniform_(crop_scale[0], crop_scale[1]).item() * crop) offset_h = np.random.randint(0, h - crop + 1) offset_w = np.random.randint(0, w - crop + 1) img = img[offset_h:offset_h + crop, offset_w:offset_w + crop] elif cropping in ['center', 'center_long_edge']: h, w, = img.shape[0], img.shape[1] crop = min(h, w) crop = int(torch.empty(1).uniform_(crop_scale[0], crop_scale[1]).item() * crop) img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(w + crop) // 2] elif cropping == 'crop': h, w, = img.shape[0], img.shape[1] crop = min(h, w, size) crop = int(torch.empty(1).uniform_(crop_scale[0], crop_scale[1]).item() * crop) offset_h = np.random.randint(0, h - crop + 1) offset_w = np.random.randint(0, w - crop + 1) img = img[offset_h:offset_h + crop, offset_w:offset_w + crop] else: raise NotImplementedError return img class PersonalizedMulti(Dataset): def __init__( self, data_root="", size=None, repeats=100, interpolation="lanczos", flip_p=0.5, which_set="train", placeholder_token="sks", per_image_tokens=False, cropping='random_long_edge', crop_scale=[1, 1], mixing_prob=0.25, coarse_class_text=None, reg=False, use_small_template=False, delimiters = ",|:|;", **kwargs, ): # NOTE: split str to list data_root = re.split(delimiters, data_root) placeholder_token = re.split(delimiters, placeholder_token) # import ipdb # ipdb.set_trace() if coarse_class_text: coarse_class_text = re.split(delimiters, coarse_class_text) coarse_class_text = [(None if (s in ['none', 'None', 'null', 'Null']) else s) for s in coarse_class_text] else: coarse_class_text = [None] * len(data_root) assert len(placeholder_token) == len(data_root) == len(coarse_class_text) self.keys = placeholder_token self.data_root = {k: v for k, v in zip(self.keys, data_root)} self.placeholder_token = {k: v for k, v in zip(self.keys, placeholder_token)} self.coarse_class_text = {k: v for k, v in zip(self.keys, coarse_class_text)} self.image_paths = {k: [os.path.join(self.data_root[k], file_path) for file_path in os.listdir(self.data_root[k])] if os.path.isdir(self.data_root[k])else [self.data_root[k]] for k in self.keys} self.num_images = {k: len(self.image_paths[k]) for k in self.keys} self._length = max([self.num_images[k] for k in self.keys]) if which_set == "train": self._length = self._length * repeats self.reg = reg # self.per_image_tokens = per_image_tokens self.cropping = cropping self.crop_scale = crop_scale self.mixing_prob = mixing_prob self.use_small_template = use_small_template self.templates = { k: self.setup_templates( placeholder_token=self.placeholder_token[k], coarse_class_text=self.coarse_class_text[k], reg=reg, use_small_template=use_small_template, ) for k in self.keys } self.size = size self.interpolation = { "linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) def setup_templates(self, placeholder_token='sks', coarse_class_text='dog', reg=False, use_small_template=False): if reg: # NOTE: reg dataset if coarse_class_text: placeholder_string = f"{coarse_class_text}" templates = imagenet_templates_small if use_small_template else reg_templates_smallest templates = [t.format(placeholder_string) for t in templates] else: templates = reg_templates_no_class_small if use_small_template else reg_templates_no_class_smallest else: # NOTE: train dataset if coarse_class_text: placeholder_string = f"{placeholder_token} {coarse_class_text}" else: placeholder_string = f"{placeholder_token}" templates = imagenet_templates_small if use_small_template else reg_templates_smallest templates = [t.format(placeholder_string) for t in templates] return templates def __len__(self): return self._length def __getitem__(self, i): key = random.choice(self.keys) example = {} image = Image.open(self.image_paths[key][i % self.num_images[key]]) if not image.mode == "RGB": image = image.convert("RGB") # default to score-sde preprocessing img = np.array(image).astype(np.uint8) img = crop_image(img, size=self.size, cropping=self.cropping, crop_scale=self.crop_scale) image = Image.fromarray(img) if self.size is not None: image = image.resize((self.size, self.size), resample=self.interpolation) image = self.flip(image) image = np.array(image).astype(np.uint8) text = random.choice(self.templates[key]) example["caption"] = text.rstrip() example["image"] = (image / 127.5 - 1.0).astype(np.float32) example["label"] = [self.keys.index(key)] # import ipdb # ipdb.set_trace() return example class SinImageDataset(PersonalizedBase): def __init__( self, data_root, size=None, repeats=100, interpolation="bicubic", flip_p=0.5, set="train", placeholder_token="dog", per_image_tokens=False, center_crop=False, mixing_prob=0.25, coarse_class_text=None, reg = False ): self.data_root = data_root assert os.path.isfile(self.data_root), f"SinImageDataset requires a path to a image file, not a directory. Got {self.data_root}." self.image_paths = [self.data_root]*100 # self._length = len(self.image_paths) self.num_images = len(self.image_paths) self._length = self.num_images self.placeholder_token = placeholder_token self.per_image_tokens = per_image_tokens self.center_crop = center_crop self.mixing_prob = mixing_prob self.coarse_class_text = coarse_class_text if per_image_tokens: assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." if set == "train": self._length = self.num_images * repeats self.size = size self.interpolation = {"linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) self.reg = reg class SinImageHighResDataset(Dataset): def __init__(self, data_root, size=512, high_resolution=1024, latent_scale=8, min_crop_frac=0.5, max_crop_frac=1.0, rec_prob=0.0, repeats=100, interpolation="bicubic", flip_p=0., set="train", placeholder_token="dog", per_image_tokens=False, mixing_prob=0.25, coarse_class_text=None): self.data_root = data_root assert os.path.isfile(self.data_root), f"SinImageDataset requires a path to a image file, not a directory. Got {self.data_root}." self.num_images = 100 self._length = self.num_images self.placeholder_token = placeholder_token self.per_image_tokens = per_image_tokens self.mixing_prob = mixing_prob self.coarse_class_text = coarse_class_text if per_image_tokens: assert self.num_images < len(per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." if set == "train": self._length = self.num_images * repeats self.size = size self.high_resolution = high_resolution self.min_crop_frac = min_crop_frac self.max_crop_frac = max_crop_frac self.rec_prob = rec_prob self.latent_scale = latent_scale self.interpolation = {"linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) image = Image.open(self.data_root) if not image.mode == "RGB": image = image.convert("RGB") self.image = image.resize((self.high_resolution, self.high_resolution), resample=self.interpolation) def __len__(self): return self._length def _random_crop(self, pil_image): patch_size_y = int( (self.high_resolution//self.latent_scale) * (random.random() * (self.max_crop_frac - self.min_crop_frac) + self.min_crop_frac)) patch_size_x = int( (self.high_resolution//self.latent_scale) * (random.random() * (self.max_crop_frac - self.min_crop_frac) + self.min_crop_frac)) crop_y = random.randrange((self.high_resolution//self.latent_scale) - patch_size_y + 1) crop_x = random.randrange((self.high_resolution//self.latent_scale) - patch_size_x + 1) return pil_image.crop((crop_x * self.latent_scale, crop_y * self.latent_scale, (crop_x + patch_size_x) * self.latent_scale, (crop_y + patch_size_y) * self.latent_scale)).resize( (self.size, self.size), resample=self.interpolation), crop_y, crop_x, crop_y + patch_size_y, crop_x + patch_size_x def __getitem__(self, i): example = {} image = deepcopy(self.image) placeholder_string = self.placeholder_token if self.coarse_class_text: placeholder_string = f"{self.coarse_class_text} {placeholder_string}" text = random.choice(training_templates_smallest).format(placeholder_string) example["caption"] = text if random.random() < self.rec_prob: image, crop_y, crop_x, crop_y1, crop_x1 = self._random_crop(image) crop_area = torch.tensor([crop_y, crop_x, crop_y1, crop_x1]) else: image = image.resize((self.size, self.size), resample=self.interpolation) crop_area = torch.tensor([0, 0, self.high_resolution//self.latent_scale, self.high_resolution//self.latent_scale]) image = self.flip(image) image = np.array(image).astype(np.uint8) example["image"] = (image / 127.5 - 1.0).astype(np.float32) example['crop_boxes'] = crop_area return example ================================================ FILE: ldm/data/personalized_painting.py ================================================ import os import numpy as np import re import PIL from PIL import Image from torch.utils.data import Dataset from torchvision import transforms from copy import deepcopy import random import torch training_templates_smallest = [ 'painting of a sks {}', ] style_templates_smallest = [ '{} in the sks style', ] reg_templates_smallest = [ 'painting of a {}', ] reg_templates_smallest = [ 'photo of a {}', ] reg_templates_no_class_smallest = [ 'a photo', ] reg_templates_no_class_small = [ 'a photo', 'a rendering', 'a cropped photo', 'the photo', 'a dark photo', 'a close-up photo', 'a bright photo', 'a cropped photo', 'a good photo', 'a rendition', 'an illustration', 'a depiction', ] per_img_token_list = [ 'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י', 'כ', 'ל', 'מ', 'נ', 'ס', 'ע', 'פ', 'צ', 'ק', 'ר', 'ש', 'ת', ] class PersonalizedBase(Dataset): def __init__(self, data_root, size=None, repeats=100, interpolation="bicubic", flip_p=0.5, set="train", placeholder_token="dog", per_image_tokens=False, center_crop=False, mixing_prob=0.25, coarse_class_text=None, reg=False, learn_style=False, ): self.data_root = data_root self.image_paths = [os.path.join( self.data_root, file_path) for file_path in os.listdir(self.data_root)] self.num_images = len(self.image_paths) self._length = self.num_images self.placeholder_token = placeholder_token self.per_image_tokens = per_image_tokens self.center_crop = center_crop self.mixing_prob = mixing_prob self.coarse_class_text = coarse_class_text self.learn_style = learn_style if per_image_tokens: assert self.num_images < len( per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." if set == "train": self._length = self.num_images * repeats self.size = size self.interpolation = {"linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) self.reg = reg def __len__(self): return self._length def __getitem__(self, i): example = {} image = Image.open(self.image_paths[i % self.num_images]) if not image.mode == "RGB": image = image.convert("RGB") placeholder_string = self.placeholder_token if self.coarse_class_text: placeholder_string = f"{self.coarse_class_text} {placeholder_string}" if not self.reg: if not self.learn_style: text = random.choice(training_templates_smallest).format( placeholder_string) else: text = random.choice(style_templates_smallest).format( placeholder_string) else: text = random.choice(reg_templates_smallest).format( placeholder_string) example["caption"] = text # default to score-sde preprocessing img = np.array(image).astype(np.uint8) if self.center_crop: crop = min(img.shape[0], img.shape[1]) h, w, = img.shape[0], img.shape[1] img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(w + crop) // 2] image = Image.fromarray(img) if self.size is not None: image = image.resize((self.size, self.size), resample=self.interpolation) image = self.flip(image) image = np.array(image).astype(np.uint8) example["image"] = (image / 127.5 - 1.0).astype(np.float32) return example class SinImageDataset(PersonalizedBase): def __init__( self, data_root, size=None, repeats=100, interpolation="bicubic", flip_p=0.5, set="train", placeholder_token="dog", per_image_tokens=False, center_crop=False, mixing_prob=0.25, coarse_class_text=None, reg=False, learn_style=False, ): self.data_root = data_root assert os.path.isfile( self.data_root), f"SinImageDataset requires a path to a image file, not a directory. Got {self.data_root}." self.image_paths = [self.data_root]*100 self.num_images = len(self.image_paths) self._length = self.num_images self.placeholder_token = placeholder_token self.per_image_tokens = per_image_tokens self.center_crop = center_crop self.mixing_prob = mixing_prob self.coarse_class_text = coarse_class_text self.learn_style = learn_style if per_image_tokens: assert self.num_images < len( per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." if set == "train": self._length = self.num_images * repeats self.size = size self.interpolation = {"linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) self.reg = reg class SinImageHighResDataset(Dataset): def __init__(self, data_root, size=512, high_resolution=1024, latent_scale=8, min_crop_frac=0.5, max_crop_frac=1.0, rec_prob=0.0, repeats=100, interpolation="bicubic", flip_p=0., set="train", placeholder_token="dog", per_image_tokens=False, mixing_prob=0.25, coarse_class_text=None): self.data_root = data_root assert os.path.isfile( self.data_root), f"SinImageDataset requires a path to a image file, not a directory. Got {self.data_root}." self.num_images = 100 self._length = self.num_images self.placeholder_token = placeholder_token self.per_image_tokens = per_image_tokens self.mixing_prob = mixing_prob self.coarse_class_text = coarse_class_text if per_image_tokens: assert self.num_images < len( per_img_token_list), f"Can't use per-image tokens when the training set contains more than {len(per_img_token_list)} tokens. To enable larger sets, add more tokens to 'per_img_token_list'." if set == "train": self._length = self.num_images * repeats self.size = size self.high_resolution = high_resolution self.min_crop_frac = min_crop_frac self.max_crop_frac = max_crop_frac self.rec_prob = rec_prob self.latent_scale = latent_scale self.interpolation = {"linear": PIL.Image.LINEAR, "bilinear": PIL.Image.BILINEAR, "bicubic": PIL.Image.BICUBIC, "lanczos": PIL.Image.LANCZOS, }[interpolation] self.flip = transforms.RandomHorizontalFlip(p=flip_p) image = Image.open(self.data_root) if not image.mode == "RGB": image = image.convert("RGB") self.image = image.resize( (self.high_resolution, self.high_resolution), resample=self.interpolation) def __len__(self): return self._length def _random_crop(self, pil_image): patch_size_y = int( (self.high_resolution//self.latent_scale) * (random.random() * (self.max_crop_frac - self.min_crop_frac) + self.min_crop_frac)) patch_size_x = int( (self.high_resolution//self.latent_scale) * (random.random() * (self.max_crop_frac - self.min_crop_frac) + self.min_crop_frac)) crop_y = random.randrange( (self.high_resolution//self.latent_scale) - patch_size_y + 1) crop_x = random.randrange( (self.high_resolution//self.latent_scale) - patch_size_x + 1) return pil_image.crop((crop_x * self.latent_scale, crop_y * self.latent_scale, (crop_x + patch_size_x) * self.latent_scale, (crop_y + patch_size_y) * self.latent_scale)).resize( (self.size, self.size), resample=self.interpolation), crop_y, crop_x, crop_y + patch_size_y, crop_x + patch_size_x def __getitem__(self, i): example = {} image = deepcopy(self.image) placeholder_string = self.placeholder_token if self.coarse_class_text: placeholder_string = f"{self.coarse_class_text} {placeholder_string}" text = random.choice(training_templates_smallest).format( placeholder_string) example["caption"] = text if random.random() < self.rec_prob: image, crop_y, crop_x, crop_y1, crop_x1 = self._random_crop(image) crop_area = torch.tensor([crop_y, crop_x, crop_y1, crop_x1]) else: image = image.resize((self.size, self.size), resample=self.interpolation) crop_area = torch.tensor( [0, 0, self.high_resolution//self.latent_scale, self.high_resolution//self.latent_scale]) image = self.flip(image) image = np.array(image).astype(np.uint8) example["image"] = (image / 127.5 - 1.0).astype(np.float32) example['crop_boxes'] = crop_area return example ================================================ FILE: ldm/lr_scheduler.py ================================================ import numpy as np class LambdaWarmUpCosineScheduler: """ note: use with a base_lr of 1.0 """ def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0): self.lr_warm_up_steps = warm_up_steps self.lr_start = lr_start self.lr_min = lr_min self.lr_max = lr_max self.lr_max_decay_steps = max_decay_steps self.last_lr = 0. self.verbosity_interval = verbosity_interval def schedule(self, n, **kwargs): if self.verbosity_interval > 0: if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}") if n < self.lr_warm_up_steps: lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start self.last_lr = lr return lr else: t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps) t = min(t, 1.0) lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * ( 1 + np.cos(t * np.pi)) self.last_lr = lr return lr def __call__(self, n, **kwargs): return self.schedule(n,**kwargs) class LambdaWarmUpCosineScheduler2: """ supports repeated iterations, configurable via lists note: use with a base_lr of 1.0. """ def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0): assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths) self.lr_warm_up_steps = warm_up_steps self.f_start = f_start self.f_min = f_min self.f_max = f_max self.cycle_lengths = cycle_lengths self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths)) self.last_f = 0. self.verbosity_interval = verbosity_interval def find_in_interval(self, n): interval = 0 for cl in self.cum_cycles[1:]: if n <= cl: return interval interval += 1 def schedule(self, n, **kwargs): cycle = self.find_in_interval(n) n = n - self.cum_cycles[cycle] if self.verbosity_interval > 0: if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " f"current cycle {cycle}") if n < self.lr_warm_up_steps[cycle]: f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] self.last_f = f return f else: t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle]) t = min(t, 1.0) f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * ( 1 + np.cos(t * np.pi)) self.last_f = f return f def __call__(self, n, **kwargs): return self.schedule(n, **kwargs) class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2): def schedule(self, n, **kwargs): cycle = self.find_in_interval(n) n = n - self.cum_cycles[cycle] if self.verbosity_interval > 0: if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, " f"current cycle {cycle}") if n < self.lr_warm_up_steps[cycle]: f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle] self.last_f = f return f else: f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle]) self.last_f = f return f ================================================ FILE: ldm/modules/attention.py ================================================ from inspect import isfunction import math import torch import torch.nn.functional as F from torch import nn, einsum from einops import rearrange, repeat from ldm.modules.diffusionmodules.util import checkpoint def exists(val): return val is not None def uniq(arr): return{el: True for el in arr}.keys() def default(val, d): if exists(val): return val return d() if isfunction(d) else d def max_neg_value(t): return -torch.finfo(t.dtype).max def init_(tensor): dim = tensor.shape[-1] std = 1 / math.sqrt(dim) tensor.uniform_(-std, std) return tensor # feedforward class GEGLU(nn.Module): def __init__(self, dim_in, dim_out): super().__init__() self.proj = nn.Linear(dim_in, dim_out * 2) def forward(self, x): x, gate = self.proj(x).chunk(2, dim=-1) return x * F.gelu(gate) class FeedForward(nn.Module): def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): super().__init__() inner_dim = int(dim * mult) dim_out = default(dim_out, dim) project_in = nn.Sequential( nn.Linear(dim, inner_dim), nn.GELU() ) if not glu else GEGLU(dim, inner_dim) self.net = nn.Sequential( project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out) ) def forward(self, x): return self.net(x) def zero_module(module): """ Zero out the parameters of a module and return it. """ for p in module.parameters(): p.detach().zero_() return module def Normalize(in_channels): return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) class LinearAttention(nn.Module): def __init__(self, dim, heads=4, dim_head=32): super().__init__() self.heads = heads hidden_dim = dim_head * heads self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False) self.to_out = nn.Conv2d(hidden_dim, dim, 1) def forward(self, x): b, c, h, w = x.shape qkv = self.to_qkv(x) q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3) k = k.softmax(dim=-1) context = torch.einsum('bhdn,bhen->bhde', k, v) out = torch.einsum('bhde,bhdn->bhen', context, q) out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w) return self.to_out(out) class SpatialSelfAttention(nn.Module): def __init__(self, in_channels): super().__init__() self.in_channels = in_channels self.norm = Normalize(in_channels) self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) def forward(self, x): h_ = x h_ = self.norm(h_) q = self.q(h_) k = self.k(h_) v = self.v(h_) # compute attention b,c,h,w = q.shape q = rearrange(q, 'b c h w -> b (h w) c') k = rearrange(k, 'b c h w -> b c (h w)') w_ = torch.einsum('bij,bjk->bik', q, k) w_ = w_ * (int(c)**(-0.5)) w_ = torch.nn.functional.softmax(w_, dim=2) # attend to values v = rearrange(v, 'b c h w -> b c (h w)') w_ = rearrange(w_, 'b i j -> b j i') h_ = torch.einsum('bij,bjk->bik', v, w_) h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) h_ = self.proj_out(h_) return x+h_ class CrossAttention(nn.Module): def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.): super().__init__() inner_dim = dim_head * heads context_dim = default(context_dim, query_dim) self.scale = dim_head ** -0.5 self.heads = heads self.to_q = nn.Linear(query_dim, inner_dim, bias=False) self.to_k = nn.Linear(context_dim, inner_dim, bias=False) self.to_v = nn.Linear(context_dim, inner_dim, bias=False) self.to_out = nn.Sequential( nn.Linear(inner_dim, query_dim), nn.Dropout(dropout) ) def forward(self, x, context=None, mask=None): h = self.heads q = self.to_q(x) context = default(context, x) k = self.to_k(context) v = self.to_v(context) q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) sim = einsum('b i d, b j d -> b i j', q, k) * self.scale if exists(mask): mask = rearrange(mask, 'b ... -> b (...)') max_neg_value = -torch.finfo(sim.dtype).max mask = repeat(mask, 'b j -> (b h) () j', h=h) sim.masked_fill_(~mask, max_neg_value) # attention, what we cannot get enough of attn = sim.softmax(dim=-1) out = einsum('b i j, b j d -> b i d', attn, v) out = rearrange(out, '(b h) n d -> b n (h d)', h=h) return self.to_out(out) class BasicTransformerBlock(nn.Module): def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True): super().__init__() self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout) # is a self-attention self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) self.attn2 = CrossAttention(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout) # is self-attn if context is none self.norm1 = nn.LayerNorm(dim) self.norm2 = nn.LayerNorm(dim) self.norm3 = nn.LayerNorm(dim) self.checkpoint = checkpoint def forward(self, x, context=None): return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint) def _forward(self, x, context=None): x = self.attn1(self.norm1(x)) + x x = self.attn2(self.norm2(x), context=context) + x x = self.ff(self.norm3(x)) + x return x class SpatialTransformer(nn.Module): """ Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply standard transformer action. Finally, reshape to image """ def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None): super().__init__() self.in_channels = in_channels inner_dim = n_heads * d_head self.norm = Normalize(in_channels) self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) self.transformer_blocks = nn.ModuleList( [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim) for d in range(depth)] ) self.proj_out = zero_module(nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)) def forward(self, x, context=None): # note: if no context is given, cross-attention defaults to self-attention b, c, h, w = x.shape x_in = x x = self.norm(x) x = self.proj_in(x) x = rearrange(x, 'b c h w -> b (h w) c') for block in self.transformer_blocks: x = block(x, context=context) x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w) x = self.proj_out(x) return x + x_in ================================================ FILE: ldm/modules/diffusionmodules/__init__.py ================================================ ================================================ FILE: ldm/modules/diffusionmodules/model.py ================================================ # pytorch_diffusion + derived encoder decoder import math import torch import torch.nn as nn import numpy as np from einops import rearrange from ldm.util import instantiate_from_config from ldm.modules.attention import LinearAttention def get_timestep_embedding(timesteps, embedding_dim): """ This matches the implementation in Denoising Diffusion Probabilistic Models: From Fairseq. Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of "Attention Is All You Need". """ assert len(timesteps.shape) == 1 half_dim = embedding_dim // 2 emb = math.log(10000) / (half_dim - 1) emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) emb = emb.to(device=timesteps.device) emb = timesteps.float()[:, None] * emb[None, :] emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) if embedding_dim % 2 == 1: # zero pad emb = torch.nn.functional.pad(emb, (0,1,0,0)) return emb def nonlinearity(x): # swish return x*torch.sigmoid(x) def Normalize(in_channels, num_groups=32): return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) class Upsample(nn.Module): def __init__(self, in_channels, with_conv): super().__init__() self.with_conv = with_conv if self.with_conv: self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) def forward(self, x): x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") if self.with_conv: x = self.conv(x) return x class Downsample(nn.Module): def __init__(self, in_channels, with_conv): super().__init__() self.with_conv = with_conv if self.with_conv: # no asymmetric padding in torch conv, must do it ourselves self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0) def forward(self, x): if self.with_conv: pad = (0,1,0,1) x = torch.nn.functional.pad(x, pad, mode="constant", value=0) x = self.conv(x) else: x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) return x class ResnetBlock(nn.Module): def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False, dropout, temb_channels=512): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels self.out_channels = out_channels self.use_conv_shortcut = conv_shortcut self.norm1 = Normalize(in_channels) self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) if temb_channels > 0: self.temb_proj = torch.nn.Linear(temb_channels, out_channels) self.norm2 = Normalize(out_channels) self.dropout = torch.nn.Dropout(dropout) self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1) if self.in_channels != self.out_channels: if self.use_conv_shortcut: self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) else: self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0) def forward(self, x, temb): h = x h = self.norm1(h) h = nonlinearity(h) h = self.conv1(h) if temb is not None: h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None] h = self.norm2(h) h = nonlinearity(h) h = self.dropout(h) h = self.conv2(h) if self.in_channels != self.out_channels: if self.use_conv_shortcut: x = self.conv_shortcut(x) else: x = self.nin_shortcut(x) return x+h class LinAttnBlock(LinearAttention): """to match AttnBlock usage""" def __init__(self, in_channels): super().__init__(dim=in_channels, heads=1, dim_head=in_channels) class AttnBlock(nn.Module): def __init__(self, in_channels): super().__init__() self.in_channels = in_channels self.norm = Normalize(in_channels) self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) def forward(self, x): h_ = x h_ = self.norm(h_) q = self.q(h_) k = self.k(h_) v = self.v(h_) # compute attention b,c,h,w = q.shape q = q.reshape(b,c,h*w) q = q.permute(0,2,1) # b,hw,c k = k.reshape(b,c,h*w) # b,c,hw w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] w_ = w_ * (int(c)**(-0.5)) w_ = torch.nn.functional.softmax(w_, dim=2) # attend to values v = v.reshape(b,c,h*w) w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q) h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] h_ = h_.reshape(b,c,h,w) h_ = self.proj_out(h_) return x+h_ def make_attn(in_channels, attn_type="vanilla"): assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown' print(f"making attention of type '{attn_type}' with {in_channels} in_channels") if attn_type == "vanilla": return AttnBlock(in_channels) elif attn_type == "none": return nn.Identity(in_channels) else: return LinAttnBlock(in_channels) class Model(nn.Module): def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"): super().__init__() if use_linear_attn: attn_type = "linear" self.ch = ch self.temb_ch = self.ch*4 self.num_resolutions = len(ch_mult) self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels self.use_timestep = use_timestep if self.use_timestep: # timestep embedding self.temb = nn.Module() self.temb.dense = nn.ModuleList([ torch.nn.Linear(self.ch, self.temb_ch), torch.nn.Linear(self.temb_ch, self.temb_ch), ]) # downsampling self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) curr_res = resolution in_ch_mult = (1,)+tuple(ch_mult) self.down = nn.ModuleList() for i_level in range(self.num_resolutions): block = nn.ModuleList() attn = nn.ModuleList() block_in = ch*in_ch_mult[i_level] block_out = ch*ch_mult[i_level] for i_block in range(self.num_res_blocks): block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout)) block_in = block_out if curr_res in attn_resolutions: attn.append(make_attn(block_in, attn_type=attn_type)) down = nn.Module() down.block = block down.attn = attn if i_level != self.num_resolutions-1: down.downsample = Downsample(block_in, resamp_with_conv) curr_res = curr_res // 2 self.down.append(down) # middle self.mid = nn.Module() self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout) self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout) # upsampling self.up = nn.ModuleList() for i_level in reversed(range(self.num_resolutions)): block = nn.ModuleList() attn = nn.ModuleList() block_out = ch*ch_mult[i_level] skip_in = ch*ch_mult[i_level] for i_block in range(self.num_res_blocks+1): if i_block == self.num_res_blocks: skip_in = ch*in_ch_mult[i_level] block.append(ResnetBlock(in_channels=block_in+skip_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout)) block_in = block_out if curr_res in attn_resolutions: attn.append(make_attn(block_in, attn_type=attn_type)) up = nn.Module() up.block = block up.attn = attn if i_level != 0: up.upsample = Upsample(block_in, resamp_with_conv) curr_res = curr_res * 2 self.up.insert(0, up) # prepend to get consistent order # end self.norm_out = Normalize(block_in) self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) def forward(self, x, t=None, context=None): #assert x.shape[2] == x.shape[3] == self.resolution if context is not None: # assume aligned context, cat along channel axis x = torch.cat((x, context), dim=1) if self.use_timestep: # timestep embedding assert t is not None temb = get_timestep_embedding(t, self.ch) temb = self.temb.dense[0](temb) temb = nonlinearity(temb) temb = self.temb.dense[1](temb) else: temb = None # downsampling hs = [self.conv_in(x)] for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): h = self.down[i_level].block[i_block](hs[-1], temb) if len(self.down[i_level].attn) > 0: h = self.down[i_level].attn[i_block](h) hs.append(h) if i_level != self.num_resolutions-1: hs.append(self.down[i_level].downsample(hs[-1])) # middle h = hs[-1] h = self.mid.block_1(h, temb) h = self.mid.attn_1(h) h = self.mid.block_2(h, temb) # upsampling for i_level in reversed(range(self.num_resolutions)): for i_block in range(self.num_res_blocks+1): h = self.up[i_level].block[i_block]( torch.cat([h, hs.pop()], dim=1), temb) if len(self.up[i_level].attn) > 0: h = self.up[i_level].attn[i_block](h) if i_level != 0: h = self.up[i_level].upsample(h) # end h = self.norm_out(h) h = nonlinearity(h) h = self.conv_out(h) return h def get_last_layer(self): return self.conv_out.weight class Encoder(nn.Module): def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla", **ignore_kwargs): super().__init__() if use_linear_attn: attn_type = "linear" self.ch = ch self.temb_ch = 0 self.num_resolutions = len(ch_mult) self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels # downsampling self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1) curr_res = resolution in_ch_mult = (1,)+tuple(ch_mult) self.in_ch_mult = in_ch_mult self.down = nn.ModuleList() for i_level in range(self.num_resolutions): block = nn.ModuleList() attn = nn.ModuleList() block_in = ch*in_ch_mult[i_level] block_out = ch*ch_mult[i_level] for i_block in range(self.num_res_blocks): block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout)) block_in = block_out if curr_res in attn_resolutions: attn.append(make_attn(block_in, attn_type=attn_type)) down = nn.Module() down.block = block down.attn = attn if i_level != self.num_resolutions-1: down.downsample = Downsample(block_in, resamp_with_conv) curr_res = curr_res // 2 self.down.append(down) # middle self.mid = nn.Module() self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout) self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout) # end self.norm_out = Normalize(block_in) self.conv_out = torch.nn.Conv2d(block_in, 2*z_channels if double_z else z_channels, kernel_size=3, stride=1, padding=1) def forward(self, x): # timestep embedding temb = None # downsampling hs = [self.conv_in(x)] for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): h = self.down[i_level].block[i_block](hs[-1], temb) if len(self.down[i_level].attn) > 0: h = self.down[i_level].attn[i_block](h) hs.append(h) if i_level != self.num_resolutions-1: hs.append(self.down[i_level].downsample(hs[-1])) # middle h = hs[-1] h = self.mid.block_1(h, temb) h = self.mid.attn_1(h) h = self.mid.block_2(h, temb) # end h = self.norm_out(h) h = nonlinearity(h) h = self.conv_out(h) return h class Decoder(nn.Module): def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks, attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels, resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False, attn_type="vanilla", **ignorekwargs): super().__init__() if use_linear_attn: attn_type = "linear" self.ch = ch self.temb_ch = 0 self.num_resolutions = len(ch_mult) self.num_res_blocks = num_res_blocks self.resolution = resolution self.in_channels = in_channels self.give_pre_end = give_pre_end self.tanh_out = tanh_out # compute in_ch_mult, block_in and curr_res at lowest res in_ch_mult = (1,)+tuple(ch_mult) block_in = ch*ch_mult[self.num_resolutions-1] curr_res = resolution // 2**(self.num_resolutions-1) self.z_shape = (1,z_channels,curr_res,curr_res) print("Working with z of shape {} = {} dimensions.".format( self.z_shape, np.prod(self.z_shape))) # z to block_in self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1) # middle self.mid = nn.Module() self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout) self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, temb_channels=self.temb_ch, dropout=dropout) # upsampling self.up = nn.ModuleList() for i_level in reversed(range(self.num_resolutions)): block = nn.ModuleList() attn = nn.ModuleList() block_out = ch*ch_mult[i_level] for i_block in range(self.num_res_blocks+1): block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout)) block_in = block_out if curr_res in attn_resolutions: attn.append(make_attn(block_in, attn_type=attn_type)) up = nn.Module() up.block = block up.attn = attn if i_level != 0: up.upsample = Upsample(block_in, resamp_with_conv) curr_res = curr_res * 2 self.up.insert(0, up) # prepend to get consistent order # end self.norm_out = Normalize(block_in) self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1) def forward(self, z): #assert z.shape[1:] == self.z_shape[1:] self.last_z_shape = z.shape # timestep embedding temb = None # z to block_in h = self.conv_in(z) # middle h = self.mid.block_1(h, temb) h = self.mid.attn_1(h) h = self.mid.block_2(h, temb) # upsampling for i_level in reversed(range(self.num_resolutions)): for i_block in range(self.num_res_blocks+1): h = self.up[i_level].block[i_block](h, temb) if len(self.up[i_level].attn) > 0: h = self.up[i_level].attn[i_block](h) if i_level != 0: h = self.up[i_level].upsample(h) # end if self.give_pre_end: return h h = self.norm_out(h) h = nonlinearity(h) h = self.conv_out(h) if self.tanh_out: h = torch.tanh(h) return h class SimpleDecoder(nn.Module): def __init__(self, in_channels, out_channels, *args, **kwargs): super().__init__() self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1), ResnetBlock(in_channels=in_channels, out_channels=2 * in_channels, temb_channels=0, dropout=0.0), ResnetBlock(in_channels=2 * in_channels, out_channels=4 * in_channels, temb_channels=0, dropout=0.0), ResnetBlock(in_channels=4 * in_channels, out_channels=2 * in_channels, temb_channels=0, dropout=0.0), nn.Conv2d(2*in_channels, in_channels, 1), Upsample(in_channels, with_conv=True)]) # end self.norm_out = Normalize(in_channels) self.conv_out = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1) def forward(self, x): for i, layer in enumerate(self.model): if i in [1,2,3]: x = layer(x, None) else: x = layer(x) h = self.norm_out(x) h = nonlinearity(h) x = self.conv_out(h) return x class UpsampleDecoder(nn.Module): def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution, ch_mult=(2,2), dropout=0.0): super().__init__() # upsampling self.temb_ch = 0 self.num_resolutions = len(ch_mult) self.num_res_blocks = num_res_blocks block_in = in_channels curr_res = resolution // 2 ** (self.num_resolutions - 1) self.res_blocks = nn.ModuleList() self.upsample_blocks = nn.ModuleList() for i_level in range(self.num_resolutions): res_block = [] block_out = ch * ch_mult[i_level] for i_block in range(self.num_res_blocks + 1): res_block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, temb_channels=self.temb_ch, dropout=dropout)) block_in = block_out self.res_blocks.append(nn.ModuleList(res_block)) if i_level != self.num_resolutions - 1: self.upsample_blocks.append(Upsample(block_in, True)) curr_res = curr_res * 2 # end self.norm_out = Normalize(block_in) self.conv_out = torch.nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1) def forward(self, x): # upsampling h = x for k, i_level in enumerate(range(self.num_resolutions)): for i_block in range(self.num_res_blocks + 1): h = self.res_blocks[i_level][i_block](h, None) if i_level != self.num_resolutions - 1: h = self.upsample_blocks[k](h) h = self.norm_out(h) h = nonlinearity(h) h = self.conv_out(h) return h class LatentRescaler(nn.Module): def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2): super().__init__() # residual block, interpolate, residual block self.factor = factor self.conv_in = nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1) self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels, out_channels=mid_channels, temb_channels=0, dropout=0.0) for _ in range(depth)]) self.attn = AttnBlock(mid_channels) self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels, out_channels=mid_channels, temb_channels=0, dropout=0.0) for _ in range(depth)]) self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1, ) def forward(self, x): x = self.conv_in(x) for block in self.res_block1: x = block(x, None) x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor)))) x = self.attn(x) for block in self.res_block2: x = block(x, None) x = self.conv_out(x) return x class MergedRescaleEncoder(nn.Module): def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks, attn_resolutions, dropout=0.0, resamp_with_conv=True, ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1): super().__init__() intermediate_chn = ch * ch_mult[-1] self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult, z_channels=intermediate_chn, double_z=False, resolution=resolution, attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv, out_ch=None) self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn, mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth) def forward(self, x): x = self.encoder(x) x = self.rescaler(x) return x class MergedRescaleDecoder(nn.Module): def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8), dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1): super().__init__() tmp_chn = z_channels*ch_mult[-1] self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks, ch_mult=ch_mult, resolution=resolution, ch=ch) self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn, out_channels=tmp_chn, depth=rescale_module_depth) def forward(self, x): x = self.rescaler(x) x = self.decoder(x) return x class Upsampler(nn.Module): def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2): super().__init__() assert out_size >= in_size num_blocks = int(np.log2(out_size//in_size))+1 factor_up = 1.+ (out_size % in_size) print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}") self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels, out_channels=in_channels) self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2, attn_resolutions=[], in_channels=None, ch=in_channels, ch_mult=[ch_mult for _ in range(num_blocks)]) def forward(self, x): x = self.rescaler(x) x = self.decoder(x) return x class Resize(nn.Module): def __init__(self, in_channels=None, learned=False, mode="bilinear"): super().__init__() self.with_conv = learned self.mode = mode if self.with_conv: print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode") raise NotImplementedError() assert in_channels is not None # no asymmetric padding in torch conv, must do it ourselves self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=4, stride=2, padding=1) def forward(self, x, scale_factor=1.0): if scale_factor==1.0: return x else: x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor) return x class FirstStagePostProcessor(nn.Module): def __init__(self, ch_mult:list, in_channels, pretrained_model:nn.Module=None, reshape=False, n_channels=None, dropout=0., pretrained_config=None): super().__init__() if pretrained_config is None: assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None' self.pretrained_model = pretrained_model else: assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None' self.instantiate_pretrained(pretrained_config) self.do_reshape = reshape if n_channels is None: n_channels = self.pretrained_model.encoder.ch self.proj_norm = Normalize(in_channels,num_groups=in_channels//2) self.proj = nn.Conv2d(in_channels,n_channels,kernel_size=3, stride=1,padding=1) blocks = [] downs = [] ch_in = n_channels for m in ch_mult: blocks.append(ResnetBlock(in_channels=ch_in,out_channels=m*n_channels,dropout=dropout)) ch_in = m * n_channels downs.append(Downsample(ch_in, with_conv=False)) self.model = nn.ModuleList(blocks) self.downsampler = nn.ModuleList(downs) def instantiate_pretrained(self, config): model = instantiate_from_config(config) self.pretrained_model = model.eval() # self.pretrained_model.train = False for param in self.pretrained_model.parameters(): param.requires_grad = False @torch.no_grad() def encode_with_pretrained(self,x): c = self.pretrained_model.encode(x) if isinstance(c, DiagonalGaussianDistribution): c = c.mode() return c def forward(self,x): z_fs = self.encode_with_pretrained(x) z = self.proj_norm(z_fs) z = self.proj(z) z = nonlinearity(z) for submodel, downmodel in zip(self.model,self.downsampler): z = submodel(z,temb=None) z = downmodel(z) if self.do_reshape: z = rearrange(z,'b c h w -> b (h w) c') return z ================================================ FILE: ldm/modules/diffusionmodules/openaimodel.py ================================================ from abc import abstractmethod from decimal import HAVE_THREADS from email.base64mime import header_length from functools import partial import math from typing import Iterable import numpy as np import torch as th import torch.nn as nn import torch.nn.functional as F from ldm.modules.diffusionmodules.util import ( checkpoint, conv_nd, linear, avg_pool_nd, zero_module, normalization, timestep_embedding, ) from ldm.modules.attention import SpatialTransformer from pydantic import StrictInt, StrictFloat, StrictBool, StrictStr import torch from ldm.modules.diffusionmodules.positional_encoding import SinusoidalPositionalEmbedding from einops import rearrange, repeat # dummy replace def convert_module_to_f16(x): pass def convert_module_to_f32(x): pass # go class AttentionPool2d(nn.Module): """ Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py """ def __init__( self, spacial_dim: int, embed_dim: int, num_heads_channels: int, output_dim: int = None, ): super().__init__() self.positional_embedding = nn.Parameter( th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5) self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) self.num_heads = embed_dim // num_heads_channels self.attention = QKVAttention(self.num_heads) def forward(self, x): b, c, *_spatial = x.shape x = x.reshape(b, c, -1) # NC(HW) x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) x = self.qkv_proj(x) x = self.attention(x) x = self.c_proj(x) return x[:, :, 0] class TimestepBlock(nn.Module): """ Any module where forward() takes timestep embeddings as a second argument. """ @abstractmethod def forward(self, x, emb): """ Apply the module to `x` given `emb` timestep embeddings. """ class TimestepEmbedSequential(nn.Sequential, TimestepBlock): """ A sequential module that passes timestep embeddings to the children that support it as an extra input. """ def forward(self, x, emb, context=None): for layer in self: if isinstance(layer, TimestepBlock): x = layer(x, emb) elif isinstance(layer, SpatialTransformer): x = layer(x, context) else: x = layer(x) return x class Upsample(nn.Module): """ An upsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the inner-two dimensions. """ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.use_conv = use_conv self.dims = dims if use_conv: self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding) def forward(self, x): assert x.shape[1] == self.channels if self.dims == 3: x = F.interpolate( x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" ) else: x = F.interpolate(x, scale_factor=2, mode="nearest") if self.use_conv: x = self.conv(x) return x class TransposedUpsample(nn.Module): 'Learned 2x upsampling without padding' def __init__(self, channels, out_channels=None, ks=5): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.up = nn.ConvTranspose2d( self.channels, self.out_channels, kernel_size=ks, stride=2) def forward(self, x): return self.up(x) class Downsample(nn.Module): """ A downsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions. """ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): super().__init__() self.channels = channels self.out_channels = out_channels or channels self.use_conv = use_conv self.dims = dims stride = 2 if dims != 3 else (1, 2, 2) if use_conv: self.op = conv_nd( dims, self.channels, self.out_channels, 3, stride=stride, padding=padding ) else: assert self.channels == self.out_channels self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) def forward(self, x): assert x.shape[1] == self.channels return self.op(x) class ResBlock(TimestepBlock): """ A residual block that can optionally change the number of channels. :param channels: the number of input channels. :param emb_channels: the number of timestep embedding channels. :param dropout: the rate of dropout. :param out_channels: if specified, the number of out channels. :param use_conv: if True and out_channels is specified, use a spatial convolution instead of a smaller 1x1 convolution to change the channels in the skip connection. :param dims: determines if the signal is 1D, 2D, or 3D. :param use_checkpoint: if True, use gradient checkpointing on this module. :param up: if True, use this block for upsampling. :param down: if True, use this block for downsampling. """ def __init__( self, channels, emb_channels, dropout, out_channels=None, use_conv=False, use_scale_shift_norm=False, dims=2, use_checkpoint=False, up=False, down=False, ): super().__init__() self.channels = channels self.emb_channels = emb_channels self.dropout = dropout self.out_channels = out_channels or channels self.use_conv = use_conv self.use_checkpoint = use_checkpoint self.use_scale_shift_norm = use_scale_shift_norm self.in_layers = nn.Sequential( normalization(channels), nn.SiLU(), conv_nd(dims, channels, self.out_channels, 3, padding=1), ) self.updown = up or down if up: self.h_upd = Upsample(channels, False, dims) self.x_upd = Upsample(channels, False, dims) elif down: self.h_upd = Downsample(channels, False, dims) self.x_upd = Downsample(channels, False, dims) else: self.h_upd = self.x_upd = nn.Identity() self.emb_layers = nn.Sequential( nn.SiLU(), linear( emb_channels, 2 * self.out_channels if use_scale_shift_norm else self.out_channels, ), ) self.out_layers = nn.Sequential( normalization(self.out_channels), nn.SiLU(), nn.Dropout(p=dropout), zero_module( conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1) ), ) if self.out_channels == channels: self.skip_connection = nn.Identity() elif use_conv: self.skip_connection = conv_nd( dims, channels, self.out_channels, 3, padding=1 ) else: self.skip_connection = conv_nd( dims, channels, self.out_channels, 1) def forward(self, x, emb): """ Apply the block to a Tensor, conditioned on a timestep embedding. :param x: an [N x C x ...] Tensor of features. :param emb: an [N x emb_channels] Tensor of timestep embeddings. :return: an [N x C x ...] Tensor of outputs. """ return checkpoint( self._forward, (x, emb), self.parameters(), self.use_checkpoint ) def _forward(self, x, emb): if self.updown: in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] h = in_rest(x) h = self.h_upd(h) x = self.x_upd(x) h = in_conv(h) else: h = self.in_layers(x) emb_out = self.emb_layers(emb).type(h.dtype) while len(emb_out.shape) < len(h.shape): emb_out = emb_out[..., None] if self.use_scale_shift_norm: out_norm, out_rest = self.out_layers[0], self.out_layers[1:] scale, shift = th.chunk(emb_out, 2, dim=1) h = out_norm(h) * (1 + scale) + shift h = out_rest(h) else: h = h + emb_out h = self.out_layers(h) return self.skip_connection(x) + h class AttentionBlock(nn.Module): """ An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted to the N-d case. https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. """ def __init__( self, channels, num_heads=1, num_head_channels=-1, use_checkpoint=False, use_new_attention_order=False, ): super().__init__() self.channels = channels if num_head_channels == -1: self.num_heads = num_heads else: assert ( channels % num_head_channels == 0 ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" self.num_heads = channels // num_head_channels self.use_checkpoint = use_checkpoint self.norm = normalization(channels) self.qkv = conv_nd(1, channels, channels * 3, 1) if use_new_attention_order: # split qkv before split heads self.attention = QKVAttention(self.num_heads) else: # split heads before split qkv self.attention = QKVAttentionLegacy(self.num_heads) self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) def forward(self, x): # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! return checkpoint(self._forward, (x,), self.parameters(), True) # return pt_checkpoint(self._forward, x) # pytorch def _forward(self, x): b, c, *spatial = x.shape x = x.reshape(b, c, -1) qkv = self.qkv(self.norm(x)) h = self.attention(qkv) h = self.proj_out(h) return (x + h).reshape(b, c, *spatial) def count_flops_attn(model, _x, y): """ A counter for the `thop` package to count the operations in an attention operation. Meant to be used like: macs, params = thop.profile( model, inputs=(inputs, timestamps), custom_ops={QKVAttention: QKVAttention.count_flops}, ) """ b, c, *spatial = y[0].shape num_spatial = int(np.prod(spatial)) # We perform two matmuls with the same number of ops. # The first computes the weight matrix, the second computes # the combination of the value vectors. matmul_ops = 2 * b * (num_spatial ** 2) * c model.total_ops += th.DoubleTensor([matmul_ops]) class QKVAttentionLegacy(nn.Module): """ A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping """ def __init__(self, n_heads): super().__init__() self.n_heads = n_heads def forward(self, qkv): """ Apply QKV attention. :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x T] tensor after attention. """ bs, width, length = qkv.shape assert width % (3 * self.n_heads) == 0 ch = width // (3 * self.n_heads) q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) scale = 1 / math.sqrt(math.sqrt(ch)) weight = th.einsum( "bct,bcs->bts", q * scale, k * scale ) # More stable with f16 than dividing afterwards weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) a = th.einsum("bts,bcs->bct", weight, v) return a.reshape(bs, -1, length) @staticmethod def count_flops(model, _x, y): return count_flops_attn(model, _x, y) class QKVAttention(nn.Module): """ A module which performs QKV attention and splits in a different order. """ def __init__(self, n_heads): super().__init__() self.n_heads = n_heads def forward(self, qkv): """ Apply QKV attention. :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x T] tensor after attention. """ bs, width, length = qkv.shape assert width % (3 * self.n_heads) == 0 ch = width // (3 * self.n_heads) q, k, v = qkv.chunk(3, dim=1) scale = 1 / math.sqrt(math.sqrt(ch)) weight = th.einsum( "bct,bcs->bts", (q * scale).view(bs * self.n_heads, ch, length), (k * scale).view(bs * self.n_heads, ch, length), ) # More stable with f16 than dividing afterwards weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) return a.reshape(bs, -1, length) @staticmethod def count_flops(model, _x, y): return count_flops_attn(model, _x, y) class UNetModel(nn.Module): """ The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample rates at which attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x downsampling, attention will be used. :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param conv_resample: if True, use learned convolutions for upsampling and downsampling. :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this model will be class-conditional with `num_classes` classes. :param use_checkpoint: use gradient checkpointing to reduce memory usage. :param num_heads: the number of attention heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use a fixed channel width per attention head. :param num_heads_upsample: works with num_heads to set a different number of heads for upsampling. Deprecated. :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks for up/downsampling. :param use_new_attention_order: use a different attention pattern for potentially increased efficiency. """ def __init__( self, image_size, in_channels, model_channels, out_channels, num_res_blocks, attention_resolutions, dropout=0, channel_mult=(1, 2, 4, 8), conv_resample=True, dims=2, num_classes=None, use_checkpoint=False, use_fp16=False, num_heads=-1, num_head_channels=-1, num_heads_upsample=-1, use_scale_shift_norm=False, resblock_updown=False, use_new_attention_order=False, use_spatial_transformer=False, # custom transformer support transformer_depth=1, # custom transformer support context_dim=None, # custom transformer support # custom support for prediction of discrete ids into codebook of first stage vq model n_embed=None, legacy=True, ): super().__init__() if use_spatial_transformer: assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...' if context_dim is not None: assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...' from omegaconf.listconfig import ListConfig if type(context_dim) == ListConfig: context_dim = list(context_dim) if num_heads_upsample == -1: num_heads_upsample = num_heads if num_heads == -1: assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' if num_head_channels == -1: assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' self.image_size = image_size self.in_channels = in_channels self.model_channels = model_channels self.out_channels = out_channels self.num_res_blocks = num_res_blocks self.attention_resolutions = attention_resolutions self.dropout = dropout self.channel_mult = channel_mult self.conv_resample = conv_resample self.num_classes = num_classes self.use_checkpoint = use_checkpoint self.dtype = th.float16 if use_fp16 else th.float32 self.num_heads = num_heads self.num_head_channels = num_head_channels self.num_heads_upsample = num_heads_upsample self.predict_codebook_ids = n_embed is not None time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), ) if self.num_classes is not None: self.label_emb = nn.Embedding(num_classes, time_embed_dim) self.input_blocks = nn.ModuleList( [ TimestepEmbedSequential( conv_nd(dims, in_channels, model_channels, 3, padding=1) ) ] ) self._feature_size = model_channels input_block_chans = [model_channels] ch = model_channels ds = 1 for level, mult in enumerate(channel_mult): for _ in range(num_res_blocks): layers = [ ResBlock( ch, time_embed_dim, dropout, out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ) ] ch = mult * model_channels if ds in attention_resolutions: if num_head_channels == -1: dim_head = ch // num_heads else: num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: #num_heads = 1 dim_head = ch // num_heads if use_spatial_transformer else num_head_channels layers.append( AttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads, num_head_channels=dim_head, use_new_attention_order=use_new_attention_order, ) if not use_spatial_transformer else SpatialTransformer( ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim ) ) self.input_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch input_block_chans.append(ch) if level != len(channel_mult) - 1: out_ch = ch self.input_blocks.append( TimestepEmbedSequential( ResBlock( ch, time_embed_dim, dropout, out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, down=True, ) if resblock_updown else Downsample( ch, conv_resample, dims=dims, out_channels=out_ch ) ) ) ch = out_ch input_block_chans.append(ch) ds *= 2 self._feature_size += ch if num_head_channels == -1: dim_head = ch // num_heads else: num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: #num_heads = 1 dim_head = ch // num_heads if use_spatial_transformer else num_head_channels self.middle_block = TimestepEmbedSequential( ResBlock( ch, time_embed_dim, dropout, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ), AttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads, num_head_channels=dim_head, use_new_attention_order=use_new_attention_order, ) if not use_spatial_transformer else SpatialTransformer( ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim ), ResBlock( ch, time_embed_dim, dropout, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ), ) self._feature_size += ch self.output_blocks = nn.ModuleList([]) for level, mult in list(enumerate(channel_mult))[::-1]: for i in range(num_res_blocks + 1): ich = input_block_chans.pop() layers = [ ResBlock( ch + ich, time_embed_dim, dropout, out_channels=model_channels * mult, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ) ] ch = model_channels * mult if ds in attention_resolutions: if num_head_channels == -1: dim_head = ch // num_heads else: num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: #num_heads = 1 dim_head = ch // num_heads if use_spatial_transformer else num_head_channels layers.append( AttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads_upsample, num_head_channels=dim_head, use_new_attention_order=use_new_attention_order, ) if not use_spatial_transformer else SpatialTransformer( ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim ) ) if level and i == num_res_blocks: out_ch = ch layers.append( ResBlock( ch, time_embed_dim, dropout, out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, up=True, ) if resblock_updown else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) ) ds //= 2 self.output_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch self.out = nn.Sequential( normalization(ch), nn.SiLU(), zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)), ) if self.predict_codebook_ids: self.id_predictor = nn.Sequential( normalization(ch), conv_nd(dims, model_channels, n_embed, 1), # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits ) def convert_to_fp16(self): """ Convert the torso of the model to float16. """ self.input_blocks.apply(convert_module_to_f16) self.middle_block.apply(convert_module_to_f16) self.output_blocks.apply(convert_module_to_f16) def convert_to_fp32(self): """ Convert the torso of the model to float32. """ self.input_blocks.apply(convert_module_to_f32) self.middle_block.apply(convert_module_to_f32) self.output_blocks.apply(convert_module_to_f32) def forward(self, x, timesteps=None, context=None, y=None, **kwargs): """ Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch of timesteps. :param context: conditioning plugged in via crossattn :param y: an [N] Tensor of labels, if class-conditional. :return: an [N x C x ...] Tensor of outputs. """ assert (y is not None) == ( self.num_classes is not None ), "must specify y if and only if the model is class-conditional" hs = [] t_emb = timestep_embedding( timesteps, self.model_channels, repeat_only=False) emb = self.time_embed(t_emb) if self.num_classes is not None: assert y.shape == (x.shape[0],) emb = emb + self.label_emb(y) h = x.type(self.dtype) for module in self.input_blocks: h = module(h, emb, context) hs.append(h) h = self.middle_block(h, emb, context) for module in self.output_blocks: h = th.cat([h, hs.pop()], dim=1) h = module(h, emb, context) h = h.type(x.dtype) if self.predict_codebook_ids: return self.id_predictor(h) else: return self.out(h) class UNetModelPatch(UNetModel): def __init__( self, image_size, in_channels, model_channels, out_channels, num_res_blocks, attention_resolutions, dropout=0, channel_mult=(1, 2, 4, 8), conv_resample=True, dims=2, num_classes=None, use_checkpoint=False, use_fp16=False, num_heads=-1, num_head_channels=-1, num_heads_upsample=-1, use_scale_shift_norm=False, resblock_updown=False, use_new_attention_order=False, use_spatial_transformer=False, # custom transformer support transformer_depth=1, # custom transformer support context_dim=None, # custom transformer support # custom support for prediction of discrete ids into codebook of first stage vq model n_embed=None, legacy=True, padding_idx: StrictInt = 0, init_size: StrictInt = 512, div_half_dim: StrictBool = False, center_shift: StrictInt = 200, interpolation_mode: StrictStr = "bilinear", ): super().__init__(image_size, in_channels=in_channels, model_channels=model_channels, out_channels=out_channels, num_res_blocks=num_res_blocks, attention_resolutions=attention_resolutions, dropout=dropout, channel_mult=channel_mult, conv_resample=conv_resample, dims=dims, num_classes=num_classes, use_checkpoint=use_checkpoint, use_fp16=use_fp16, num_heads=num_heads, num_head_channels=num_head_channels, num_heads_upsample=num_heads_upsample, use_scale_shift_norm=use_scale_shift_norm, resblock_updown=resblock_updown, use_new_attention_order=use_new_attention_order, use_spatial_transformer=use_spatial_transformer, # custom transformer support transformer_depth=transformer_depth, # custom transformer support context_dim=context_dim, # custom transformer support # custom support for prediction of discrete ids into codebook of first stage vq model n_embed=n_embed, legacy=legacy,) assert model_channels % 2 == 0 self.head_position_encode = SinusoidalPositionalEmbedding(embedding_dim=model_channels//2, padding_idx=padding_idx, init_size=init_size, div_half_dim=div_half_dim, center_shift=center_shift) self.init_size = init_size self.interpolation_mode = interpolation_mode def forward(self, x, timesteps=None, context=None, y=None, crop_boxes=None, **kwargs): """ Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch of timesteps. :param context: conditioning plugged in via crossattn :param y: an [N] Tensor of labels, if class-conditional. :return: an [N x C x ...] Tensor of outputs. """ assert (y is not None) == ( self.num_classes is not None ), "must specify y if and only if the model is class-conditional" hs = [] t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) emb = self.time_embed(t_emb) head_grid = self.head_position_encode(torch.ones([x.shape[0], x.shape[1], self.init_size, self.init_size], dtype=self.dtype, device=x.device)) if self.interpolation_mode == 'bilinear': if crop_boxes is not None: head_grid = torch.cat([F.interpolate(hg.unsqueeze(0)[:, :, box[0]: box[2], box[1]: box[3]], (x.shape[2], x.shape[3]), mode='bilinear', align_corners=True) for hg, box in zip(head_grid, crop_boxes)], dim=0) else: head_grid = F.interpolate(head_grid, (x.shape[2], x.shape[3]), mode='bilinear', align_corners=True) elif self.interpolation_mode == 'nearest': if crop_boxes is not None: head_grid = torch.cat([F.interpolate(hg.unsqueeze(0)[:, :, box[0]: box[2], box[1]: box[3]], (x.shape[2], x.shape[3]), mode='nearest') for hg, box in zip(head_grid, crop_boxes)], dim=0) else: head_grid = F.interpolate(head_grid, (x.shape[2], x.shape[3]), mode='nearest') else: raise NotImplementedError if self.num_classes is not None: assert y.shape == (x.shape[0],) emb = emb + self.label_emb(y) # import ipdb # ipdb.set_trace() h = x.type(self.dtype) for i, module in enumerate(self.input_blocks): h = module(h, emb, context) if i == 0: h += head_grid hs.append(h) h = self.middle_block(h, emb, context) for module in self.output_blocks: h = th.cat([h, hs.pop()], dim=1) h = module(h, emb, context) h = h.type(x.dtype) if self.predict_codebook_ids: return self.id_predictor(h) else: return self.out(h) class EncoderUNetModel(nn.Module): """ The half UNet model with attention and timestep embedding. For usage, see UNet. """ def __init__( self, image_size, in_channels, model_channels, out_channels, num_res_blocks, attention_resolutions, dropout=0, channel_mult=(1, 2, 4, 8), conv_resample=True, dims=2, use_checkpoint=False, use_fp16=False, num_heads=1, num_head_channels=-1, num_heads_upsample=-1, use_scale_shift_norm=False, resblock_updown=False, use_new_attention_order=False, pool="adaptive", *args, **kwargs ): super().__init__() if num_heads_upsample == -1: num_heads_upsample = num_heads self.in_channels = in_channels self.model_channels = model_channels self.out_channels = out_channels self.num_res_blocks = num_res_blocks self.attention_resolutions = attention_resolutions self.dropout = dropout self.channel_mult = channel_mult self.conv_resample = conv_resample self.use_checkpoint = use_checkpoint self.dtype = th.float16 if use_fp16 else th.float32 self.num_heads = num_heads self.num_head_channels = num_head_channels self.num_heads_upsample = num_heads_upsample time_embed_dim = model_channels * 4 self.time_embed = nn.Sequential( linear(model_channels, time_embed_dim), nn.SiLU(), linear(time_embed_dim, time_embed_dim), ) self.input_blocks = nn.ModuleList( [ TimestepEmbedSequential( conv_nd(dims, in_channels, model_channels, 3, padding=1) ) ] ) self._feature_size = model_channels input_block_chans = [model_channels] ch = model_channels ds = 1 for level, mult in enumerate(channel_mult): for _ in range(num_res_blocks): layers = [ ResBlock( ch, time_embed_dim, dropout, out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ) ] ch = mult * model_channels if ds in attention_resolutions: layers.append( AttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads, num_head_channels=num_head_channels, use_new_attention_order=use_new_attention_order, ) ) self.input_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch input_block_chans.append(ch) if level != len(channel_mult) - 1: out_ch = ch self.input_blocks.append( TimestepEmbedSequential( ResBlock( ch, time_embed_dim, dropout, out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, down=True, ) if resblock_updown else Downsample( ch, conv_resample, dims=dims, out_channels=out_ch ) ) ) ch = out_ch input_block_chans.append(ch) ds *= 2 self._feature_size += ch self.middle_block = TimestepEmbedSequential( ResBlock( ch, time_embed_dim, dropout, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ), AttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads, num_head_channels=num_head_channels, use_new_attention_order=use_new_attention_order, ), ResBlock( ch, time_embed_dim, dropout, dims=dims, use_checkpoint=use_checkpoint, use_scale_shift_norm=use_scale_shift_norm, ), ) self._feature_size += ch self.pool = pool if pool == "adaptive": self.out = nn.Sequential( normalization(ch), nn.SiLU(), nn.AdaptiveAvgPool2d((1, 1)), zero_module(conv_nd(dims, ch, out_channels, 1)), nn.Flatten(), ) elif pool == "attention": assert num_head_channels != -1 self.out = nn.Sequential( normalization(ch), nn.SiLU(), AttentionPool2d( (image_size // ds), ch, num_head_channels, out_channels ), ) elif pool == "spatial": self.out = nn.Sequential( nn.Linear(self._feature_size, 2048), nn.ReLU(), nn.Linear(2048, self.out_channels), ) elif pool == "spatial_v2": self.out = nn.Sequential( nn.Linear(self._feature_size, 2048), normalization(2048), nn.SiLU(), nn.Linear(2048, self.out_channels), ) else: raise NotImplementedError(f"Unexpected {pool} pooling") def convert_to_fp16(self): """ Convert the torso of the model to float16. """ self.input_blocks.apply(convert_module_to_f16) self.middle_block.apply(convert_module_to_f16) def convert_to_fp32(self): """ Convert the torso of the model to float32. """ self.input_blocks.apply(convert_module_to_f32) self.middle_block.apply(convert_module_to_f32) def forward(self, x, timesteps): """ Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch of timesteps. :return: an [N x K] Tensor of outputs. """ emb = self.time_embed(timestep_embedding( timesteps, self.model_channels)) results = [] h = x.type(self.dtype) for module in self.input_blocks: h = module(h, emb) if self.pool.startswith("spatial"): results.append(h.type(x.dtype).mean(dim=(2, 3))) h = self.middle_block(h, emb) if self.pool.startswith("spatial"): results.append(h.type(x.dtype).mean(dim=(2, 3))) h = th.cat(results, axis=-1) return self.out(h) else: h = h.type(x.dtype) return self.out(h) ================================================ FILE: ldm/modules/diffusionmodules/positional_encoding.py ================================================ #!/usr/bin/env python # encoding: utf-8 # @Time : 3/31/22 21:42 # @Author : Zhixing Zhang # @File : positional_encoding.py # @Contact : zhixing.zhang@rutgers.edu # @Desc : import numpy as np import torch import torch.nn as nn class SinusoidalPositionalEmbedding(nn.Module): """Sinusoidal Positional Embedding 1D or 2D (SPE/SPE2d). This module is a modified from: https://github.com/pytorch/fairseq/blob/master/fairseq/modules/sinusoidal_positional_embedding.py # noqa Based on the original SPE in single dimension, we implement a 2D sinusoidal positional encodding (SPE2d), as introduced in Positional Encoding as Spatial Inductive Bias in GANs, CVPR'2021. Args: embedding_dim (int): The number of dimensions for the positional encoding. padding_idx (int | list[int]): The index for the padding contents. The padding positions will obtain an encoding vector filling in zeros. init_size (int, optional): The initial size of the positional buffer. Defaults to 1024. div_half_dim (bool, optional): If true, the embedding will be divided by :math:`d/2`. Otherwise, it will be divided by :math:`(d/2 -1)`. Defaults to False. center_shift (int | None, optional): Shift the center point to some index. Defaults to None. """ def __init__(self, embedding_dim, padding_idx, init_size=1024, div_half_dim=False, center_shift=None): super().__init__() self.embedding_dim = embedding_dim self.padding_idx = padding_idx self.div_half_dim = div_half_dim self.center_shift = center_shift self.weights = SinusoidalPositionalEmbedding.get_embedding( init_size, embedding_dim, padding_idx, self.div_half_dim) self.register_buffer('_float_tensor', torch.FloatTensor(1)) self.max_positions = int(1e5) @staticmethod def get_embedding(num_embeddings, embedding_dim, padding_idx=None, div_half_dim=False): """Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of "Attention Is All You Need". """ assert embedding_dim % 2 == 0, ( 'In this version, we request ' f'embedding_dim divisible by 2 but got {embedding_dim}') # there is a little difference from the original paper. half_dim = embedding_dim // 2 if not div_half_dim: emb = np.log(10000) / (half_dim - 1) else: emb = np.log(1e4) / half_dim # compute exp(-log10000 / d * i) emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) emb = torch.arange( num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) if padding_idx is not None: emb[padding_idx, :] = 0 return emb def forward(self, input, **kwargs): """Input is expected to be of size [bsz x seqlen]. Returned tensor is expected to be of size [bsz x seq_len x emb_dim] """ assert input.dim() == 2 or input.dim( ) == 4, 'Input dimension should be 2 (1D) or 4(2D)' if input.dim() == 4: return self.make_grid2d_like(input, **kwargs) b, seq_len = input.shape max_pos = self.padding_idx + 1 + seq_len if self.weights is None or max_pos > self.weights.size(0): # recompute/expand embedding if needed self.weights = SinusoidalPositionalEmbedding.get_embedding( max_pos, self.embedding_dim, self.padding_idx) self.weights = self.weights.to(self._float_tensor) positions = self.make_positions(input, self.padding_idx).to( self._float_tensor.device) return self.weights.index_select(0, positions.view(-1)).view( b, seq_len, self.embedding_dim).detach() def make_positions(self, input, padding_idx): mask = input.ne(padding_idx).int() return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx def make_grid2d(self, height, width, num_batches=1, center_shift=None): h, w = height, width # if `center_shift` is not given from the outside, use # `self.center_shift` if center_shift is None: center_shift = self.center_shift h_shift = 0 w_shift = 0 # center shift to the input grid if center_shift is not None: # if h/w is even, the left center should be aligned with # center shift if h % 2 == 0: h_left_center = h // 2 h_shift = center_shift - h_left_center else: h_center = h // 2 + 1 h_shift = center_shift - h_center if w % 2 == 0: w_left_center = w // 2 w_shift = center_shift - w_left_center else: w_center = w // 2 + 1 w_shift = center_shift - w_center # Note that the index is started from 1 since zero will be padding idx. # axis -- (b, h or w) x_axis = torch.arange(1, w + 1).unsqueeze(0).repeat(num_batches, 1) + w_shift y_axis = torch.arange(1, h + 1).unsqueeze(0).repeat(num_batches, 1) + h_shift # emb -- (b, emb_dim, h or w) x_emb = self(x_axis).transpose(1, 2) y_emb = self(y_axis).transpose(1, 2) # make grid for x/y axis # Note that repeat will copy data. If use learned emb, expand may be # better. x_grid = x_emb.unsqueeze(2).repeat(1, 1, h, 1) y_grid = y_emb.unsqueeze(3).repeat(1, 1, 1, w) # cat grid -- (b, 2 x emb_dim, h, w) grid = torch.cat([x_grid, y_grid], dim=1) return grid.detach() def make_grid2d_like(self, x, center_shift=None): """Input tensor with shape of (b, ..., h, w) Return tensor with shape of (b, 2 x emb_dim, h, w) Note that the positional embedding highly depends on the the function, ``make_positions``. """ h, w = x.shape[-2:] grid = self.make_grid2d(h, w, x.size(0), center_shift) return grid.to(x) class CatersianGrid(nn.Module): """Catersian Grid for 2d tensor. The Catersian Grid is a common-used positional encoding in deep learning. In this implementation, we follow the convention of ``grid_sample`` in PyTorch. In other words, ``[-1, -1]`` denotes the left-top corner while ``[1, 1]`` denotes the right-botton corner. """ def forward(self, x, **kwargs): assert x.dim() == 4 return self.make_grid2d_like(x, **kwargs) def make_grid2d(self, height, width, num_batches=1, requires_grad=False): h, w = height, width grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w)) grid_x = 2 * grid_x / max(float(w) - 1., 1.) - 1. grid_y = 2 * grid_y / max(float(h) - 1., 1.) - 1. grid = torch.stack((grid_x, grid_y), 0) grid.requires_grad = requires_grad grid = torch.unsqueeze(grid, 0) grid = grid.repeat(num_batches, 1, 1, 1) return grid def make_grid2d_like(self, x, requires_grad=False): h, w = x.shape[-2:] grid = self.make_grid2d(h, w, x.size(0), requires_grad=requires_grad) return grid.to(x) ================================================ FILE: ldm/modules/diffusionmodules/util.py ================================================ # adopted from # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py # and # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py # and # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py # # thanks! import os import math import torch import torch.nn as nn import numpy as np from einops import repeat from ldm.util import instantiate_from_config def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): if schedule == "linear": betas = ( torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 ) elif schedule == "cosine": timesteps = ( torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s ) alphas = timesteps / (1 + cosine_s) * np.pi / 2 alphas = torch.cos(alphas).pow(2) alphas = alphas / alphas[0] betas = 1 - alphas[1:] / alphas[:-1] betas = np.clip(betas, a_min=0, a_max=0.999) elif schedule == "sqrt_linear": betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) elif schedule == "sqrt": betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5 else: raise ValueError(f"schedule '{schedule}' unknown.") return betas.numpy() def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True): if ddim_discr_method == 'uniform': c = num_ddpm_timesteps // num_ddim_timesteps ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) elif ddim_discr_method == 'quad': ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int) else: raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"') # assert ddim_timesteps.shape[0] == num_ddim_timesteps # add one to get the final alpha values right (the ones from first scale to data during sampling) steps_out = ddim_timesteps + 1 if verbose: print(f'Selected timesteps for ddim sampler: {steps_out}') return steps_out def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): # select alphas for computing the variance schedule alphas = alphacums[ddim_timesteps] alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) # according the the formula provided in https://arxiv.org/abs/2010.02502 sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) if verbose: print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}') print(f'For the chosen value of eta, which is {eta}, ' f'this results in the following sigma_t schedule for ddim sampler {sigmas}') return sigmas, alphas, alphas_prev def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. :param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t from 0 to 1 and produces the cumulative product of (1-beta) up to that part of the diffusion process. :param max_beta: the maximum beta to use; use values lower than 1 to prevent singularities. """ betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) return np.array(betas) def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.gather(-1, t) return out.reshape(b, *((1,) * (len(x_shape) - 1))) def checkpoint(func, inputs, params, flag): """ Evaluate a function without caching intermediate activations, allowing for reduced memory at the expense of extra compute in the backward pass. :param func: the function to evaluate. :param inputs: the argument sequence to pass to `func`. :param params: a sequence of parameters `func` depends on but does not explicitly take as arguments. :param flag: if False, disable gradient checkpointing. """ if False: # disabled checkpointing to allow requires_grad = False for main model args = tuple(inputs) + tuple(params) return CheckpointFunction.apply(func, len(inputs), *args) else: return func(*inputs) class CheckpointFunction(torch.autograd.Function): @staticmethod def forward(ctx, run_function, length, *args): ctx.run_function = run_function ctx.input_tensors = list(args[:length]) ctx.input_params = list(args[length:]) with torch.no_grad(): output_tensors = ctx.run_function(*ctx.input_tensors) return output_tensors @staticmethod def backward(ctx, *output_grads): ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] with torch.enable_grad(): # Fixes a bug where the first op in run_function modifies the # Tensor storage in place, which is not allowed for detach()'d # Tensors. shallow_copies = [x.view_as(x) for x in ctx.input_tensors] output_tensors = ctx.run_function(*shallow_copies) input_grads = torch.autograd.grad( output_tensors, ctx.input_tensors + ctx.input_params, output_grads, allow_unused=True, ) del ctx.input_tensors del ctx.input_params del output_tensors return (None, None) + input_grads def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): """ Create sinusoidal timestep embeddings. :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional. :param dim: the dimension of the output. :param max_period: controls the minimum frequency of the embeddings. :return: an [N x dim] Tensor of positional embeddings. """ if not repeat_only: half = dim // 2 freqs = torch.exp( -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half ).to(device=timesteps.device) args = timesteps[:, None].float() * freqs[None] embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) if dim % 2: embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) else: embedding = repeat(timesteps, 'b -> b d', d=dim) return embedding def zero_module(module): """ Zero out the parameters of a module and return it. """ for p in module.parameters(): p.detach().zero_() return module def scale_module(module, scale): """ Scale the parameters of a module and return it. """ for p in module.parameters(): p.detach().mul_(scale) return module def mean_flat(tensor): """ Take the mean over all non-batch dimensions. """ return tensor.mean(dim=list(range(1, len(tensor.shape)))) def normalization(channels): """ Make a standard normalization layer. :param channels: number of input channels. :return: an nn.Module for normalization. """ return GroupNorm32(32, channels) # PyTorch 1.7 has SiLU, but we support PyTorch 1.5. class SiLU(nn.Module): def forward(self, x): return x * torch.sigmoid(x) class GroupNorm32(nn.GroupNorm): def forward(self, x): return super().forward(x.float()).type(x.dtype) def conv_nd(dims, *args, **kwargs): """ Create a 1D, 2D, or 3D convolution module. """ if dims == 1: return nn.Conv1d(*args, **kwargs) elif dims == 2: return nn.Conv2d(*args, **kwargs) elif dims == 3: return nn.Conv3d(*args, **kwargs) raise ValueError(f"unsupported dimensions: {dims}") def linear(*args, **kwargs): """ Create a linear module. """ return nn.Linear(*args, **kwargs) def avg_pool_nd(dims, *args, **kwargs): """ Create a 1D, 2D, or 3D average pooling module. """ if dims == 1: return nn.AvgPool1d(*args, **kwargs) elif dims == 2: return nn.AvgPool2d(*args, **kwargs) elif dims == 3: return nn.AvgPool3d(*args, **kwargs) raise ValueError(f"unsupported dimensions: {dims}") class HybridConditioner(nn.Module): def __init__(self, c_concat_config, c_crossattn_config): super().__init__() self.concat_conditioner = instantiate_from_config(c_concat_config) self.crossattn_conditioner = instantiate_from_config(c_crossattn_config) def forward(self, c_concat, c_crossattn): c_concat = self.concat_conditioner(c_concat) c_crossattn = self.crossattn_conditioner(c_crossattn) return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} def noise_like(shape, device, repeat=False): repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) noise = lambda: torch.randn(shape, device=device) return repeat_noise() if repeat else noise() ================================================ FILE: ldm/modules/distributions/__init__.py ================================================ ================================================ FILE: ldm/modules/distributions/distributions.py ================================================ import torch import numpy as np class AbstractDistribution: def sample(self): raise NotImplementedError() def mode(self): raise NotImplementedError() class DiracDistribution(AbstractDistribution): def __init__(self, value): self.value = value def sample(self): return self.value def mode(self): return self.value class DiagonalGaussianDistribution(object): def __init__(self, parameters, deterministic=False): self.parameters = parameters self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) self.logvar = torch.clamp(self.logvar, -30.0, 20.0) self.deterministic = deterministic self.std = torch.exp(0.5 * self.logvar) self.var = torch.exp(self.logvar) if self.deterministic: self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) def sample(self): x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) return x def kl(self, other=None): if self.deterministic: return torch.Tensor([0.]) else: if other is None: return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3]) else: return 0.5 * torch.sum( torch.pow(self.mean - other.mean, 2) / other.var + self.var / other.var - 1.0 - self.logvar + other.logvar, dim=[1, 2, 3]) def nll(self, sample, dims=[1,2,3]): if self.deterministic: return torch.Tensor([0.]) logtwopi = np.log(2.0 * np.pi) return 0.5 * torch.sum( logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims) def mode(self): return self.mean def normal_kl(mean1, logvar1, mean2, logvar2): """ source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 Compute the KL divergence between two gaussians. Shapes are automatically broadcasted, so batches can be compared to scalars, among other use cases. """ tensor = None for obj in (mean1, logvar1, mean2, logvar2): if isinstance(obj, torch.Tensor): tensor = obj break assert tensor is not None, "at least one argument must be a Tensor" # Force variances to be Tensors. Broadcasting helps convert scalars to # Tensors, but it does not work for torch.exp(). logvar1, logvar2 = [ x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) for x in (logvar1, logvar2) ] return 0.5 * ( -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) ) ================================================ FILE: ldm/modules/ema.py ================================================ import torch from torch import nn class LitEma(nn.Module): def __init__(self, model, decay=0.9999, use_num_upates=True): super().__init__() if decay < 0.0 or decay > 1.0: raise ValueError('Decay must be between 0 and 1') self.m_name2s_name = {} self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates else torch.tensor(-1,dtype=torch.int)) for name, p in model.named_parameters(): if p.requires_grad: #remove as '.'-character is not allowed in buffers s_name = name.replace('.','') self.m_name2s_name.update({name:s_name}) self.register_buffer(s_name,p.clone().detach().data) self.collected_params = [] def forward(self,model): decay = self.decay if self.num_updates >= 0: self.num_updates += 1 decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates)) one_minus_decay = 1.0 - decay with torch.no_grad(): m_param = dict(model.named_parameters()) shadow_params = dict(self.named_buffers()) for key in m_param: if m_param[key].requires_grad: sname = self.m_name2s_name[key] shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) else: assert not key in self.m_name2s_name def copy_to(self, model): m_param = dict(model.named_parameters()) shadow_params = dict(self.named_buffers()) for key in m_param: if m_param[key].requires_grad: m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) else: assert not key in self.m_name2s_name def store(self, parameters): """ Save the current parameters for restoring later. Args: parameters: Iterable of `torch.nn.Parameter`; the parameters to be temporarily stored. """ self.collected_params = [param.clone() for param in parameters] def restore(self, parameters): """ Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters without affecting the original optimization process. Store the parameters before the `copy_to` method. After validation (or model saving), use this to restore the former parameters. Args: parameters: Iterable of `torch.nn.Parameter`; the parameters to be updated with the stored parameters. """ for c_param, param in zip(self.collected_params, parameters): param.data.copy_(c_param.data) ================================================ FILE: ldm/modules/embedding_manager.py ================================================ import torch from torch import nn from ldm.data.personalized import per_img_token_list from transformers import CLIPTokenizer from functools import partial DEFAULT_PLACEHOLDER_TOKEN = ["*"] PROGRESSIVE_SCALE = 2000 def get_clip_token_for_string(tokenizer, string): batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True, return_overflowing_tokens=False, padding="max_length", return_tensors="pt") tokens = batch_encoding["input_ids"] assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string" return tokens[0, 1] def get_bert_token_for_string(tokenizer, string): token = tokenizer(string) assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string" token = token[0, 1] return token def get_embedding_for_clip_token(embedder, token): return embedder(token.unsqueeze(0))[0, 0] class EmbeddingManager(nn.Module): def __init__( self, embedder, placeholder_strings=None, initializer_words=None, per_image_tokens=False, num_vectors_per_token=1, progressive_words=False, **kwargs ): super().__init__() self.string_to_token_dict = {} self.string_to_param_dict = nn.ParameterDict() self.initial_embeddings = nn.ParameterDict() # These should not be optimized self.progressive_words = progressive_words self.progressive_counter = 0 self.max_vectors_per_token = num_vectors_per_token if hasattr(embedder, 'tokenizer'): # using Stable Diffusion's CLIP encoder self.is_clip = True get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer) get_embedding_for_tkn = partial(get_embedding_for_clip_token, embedder.transformer.text_model.embeddings) token_dim = 768 else: # using LDM's BERT encoder self.is_clip = False get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn) get_embedding_for_tkn = embedder.transformer.token_emb token_dim = 1280 if per_image_tokens: placeholder_strings.extend(per_img_token_list) for idx, placeholder_string in enumerate(placeholder_strings): token = get_token_for_string(placeholder_string) if initializer_words and idx < len(initializer_words): init_word_token = get_token_for_string(initializer_words[idx]) with torch.no_grad(): init_word_embedding = get_embedding_for_tkn(init_word_token.cpu()) token_params = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=True) self.initial_embeddings[placeholder_string] = torch.nn.Parameter(init_word_embedding.unsqueeze(0).repeat(num_vectors_per_token, 1), requires_grad=False) else: token_params = torch.nn.Parameter(torch.rand(size=(num_vectors_per_token, token_dim), requires_grad=True)) self.string_to_token_dict[placeholder_string] = token self.string_to_param_dict[placeholder_string] = token_params def forward( self, tokenized_text, embedded_text, ): b, n, device = *tokenized_text.shape, tokenized_text.device for placeholder_string, placeholder_token in self.string_to_token_dict.items(): placeholder_embedding = self.string_to_param_dict[placeholder_string].to(device) if self.max_vectors_per_token == 1: # If there's only one vector per token, we can do a simple replacement placeholder_idx = torch.where(tokenized_text == placeholder_token.to(device)) embedded_text[placeholder_idx] = placeholder_embedding else: # otherwise, need to insert and keep track of changing indices if self.progressive_words: self.progressive_counter += 1 max_step_tokens = 1 + self.progressive_counter // PROGRESSIVE_SCALE else: max_step_tokens = self.max_vectors_per_token num_vectors_for_token = min(placeholder_embedding.shape[0], max_step_tokens) placeholder_rows, placeholder_cols = torch.where(tokenized_text == placeholder_token.to(device)) if placeholder_rows.nelement() == 0: continue sorted_cols, sort_idx = torch.sort(placeholder_cols, descending=True) sorted_rows = placeholder_rows[sort_idx] for idx in range(len(sorted_rows)): row = sorted_rows[idx] col = sorted_cols[idx] new_token_row = torch.cat([tokenized_text[row][:col], placeholder_token.repeat(num_vectors_for_token).to(device), tokenized_text[row][col + 1:]], axis=0)[:n] new_embed_row = torch.cat([embedded_text[row][:col], placeholder_embedding[:num_vectors_for_token], embedded_text[row][col + 1:]], axis=0)[:n] embedded_text[row] = new_embed_row tokenized_text[row] = new_token_row return embedded_text def save(self, ckpt_path): torch.save({"string_to_token": self.string_to_token_dict, "string_to_param": self.string_to_param_dict}, ckpt_path) def load(self, ckpt_path): ckpt = torch.load(ckpt_path, map_location='cpu') self.string_to_token_dict = ckpt["string_to_token"] self.string_to_param_dict = ckpt["string_to_param"] def get_embedding_norms_squared(self): all_params = torch.cat(list(self.string_to_param_dict.values()), axis=0) # num_placeholders x embedding_dim param_norm_squared = (all_params * all_params).sum(axis=-1) # num_placeholders return param_norm_squared def embedding_parameters(self): return self.string_to_param_dict.parameters() def embedding_to_coarse_loss(self): loss = 0. num_embeddings = len(self.initial_embeddings) for key in self.initial_embeddings: optimized = self.string_to_param_dict[key] coarse = self.initial_embeddings[key].clone().to(optimized.device) loss = loss + (optimized - coarse) @ (optimized - coarse).T / num_embeddings return loss ================================================ FILE: ldm/modules/encoders/__init__.py ================================================ ================================================ FILE: ldm/modules/encoders/modules.py ================================================ import torch import torch.nn as nn from functools import partial import clip from einops import rearrange, repeat from transformers import CLIPTokenizer, CLIPTextModel import kornia from ldm.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test def _expand_mask(mask, dtype, tgt_len = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) inverted_mask = 1.0 - expanded_mask return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) def _build_causal_attention_mask(bsz, seq_len, dtype): # lazily create causal attention mask, with full attention between the vision tokens # pytorch uses additive attention mask; fill with -inf mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) mask.fill_(torch.tensor(torch.finfo(dtype).min)) mask.triu_(1) # zero out the lower diagonal mask = mask.unsqueeze(1) # expand mask return mask class AbstractEncoder(nn.Module): def __init__(self): super().__init__() def encode(self, *args, **kwargs): raise NotImplementedError class ClassEmbedder(nn.Module): def __init__(self, embed_dim, n_classes=1000, key='class'): super().__init__() self.key = key self.embedding = nn.Embedding(n_classes, embed_dim) def forward(self, batch, key=None): if key is None: key = self.key # this is for use in crossattn c = batch[key][:, None] c = self.embedding(c) return c class TransformerEmbedder(AbstractEncoder): """Some transformer encoder layers""" def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"): super().__init__() self.device = device self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len, attn_layers=Encoder(dim=n_embed, depth=n_layer)) def forward(self, tokens): tokens = tokens.to(self.device) # meh z = self.transformer(tokens, return_embeddings=True) return z def encode(self, x): return self(x) class BERTTokenizer(AbstractEncoder): """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)""" def __init__(self, device="cuda", vq_interface=True, max_length=77): super().__init__() from transformers import BertTokenizerFast # TODO: add to reuquirements self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") self.device = device self.vq_interface = vq_interface self.max_length = max_length def forward(self, text): batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, return_overflowing_tokens=False, padding="max_length", return_tensors="pt") tokens = batch_encoding["input_ids"].to(self.device) return tokens @torch.no_grad() def encode(self, text): tokens = self(text) if not self.vq_interface: return tokens return None, None, [None, None, tokens] def decode(self, text): return text class BERTEmbedder(AbstractEncoder): """Uses the BERT tokenizr model and add some transformer encoder layers""" def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77, device="cuda",use_tokenizer=True, embedding_dropout=0.0): super().__init__() self.use_tknz_fn = use_tokenizer if self.use_tknz_fn: self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len) self.device = device self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len, attn_layers=Encoder(dim=n_embed, depth=n_layer), emb_dropout=embedding_dropout) def forward(self, text, embedding_manager=None): if self.use_tknz_fn: tokens = self.tknz_fn(text)#.to(self.device) else: tokens = text z = self.transformer(tokens, return_embeddings=True, embedding_manager=embedding_manager) return z def encode(self, text, **kwargs): # output of length 77 return self(text, **kwargs) class SpatialRescaler(nn.Module): def __init__(self, n_stages=1, method='bilinear', multiplier=0.5, in_channels=3, out_channels=None, bias=False): super().__init__() self.n_stages = n_stages assert self.n_stages >= 0 assert method in ['nearest','linear','bilinear','trilinear','bicubic','area'] self.multiplier = multiplier self.interpolator = partial(torch.nn.functional.interpolate, mode=method) self.remap_output = out_channels is not None if self.remap_output: print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.') self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias) def forward(self,x): for stage in range(self.n_stages): x = self.interpolator(x, scale_factor=self.multiplier) if self.remap_output: x = self.channel_mapper(x) return x def encode(self, x): return self(x) class FrozenCLIPEmbedder(AbstractEncoder): """Uses the CLIP transformer encoder for text (from Hugging Face)""" def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77): super().__init__() self.tokenizer = CLIPTokenizer.from_pretrained(version) self.transformer = CLIPTextModel.from_pretrained(version) self.device = device self.max_length = max_length def embedding_forward( self, input_ids = None, position_ids = None, inputs_embeds = None, embedding_manager = None, ) -> torch.Tensor: seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] if position_ids is None: position_ids = self.position_ids[:, :seq_length] if inputs_embeds is None: inputs_embeds = self.token_embedding(input_ids) if embedding_manager is not None: inputs_embeds = embedding_manager(input_ids, inputs_embeds) position_embeddings = self.position_embedding(position_ids) embeddings = inputs_embeds + position_embeddings return embeddings self.transformer.text_model.embeddings.forward = embedding_forward.__get__(self.transformer.text_model.embeddings) def encoder_forward( self, inputs_embeds, attention_mask = None, causal_attention_mask = None, output_attentions = None, output_hidden_states = None, return_dict = None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, output_attentions=output_attentions, ) hidden_states = layer_outputs[0] if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: encoder_states = encoder_states + (hidden_states,) return hidden_states self.transformer.text_model.encoder.forward = encoder_forward.__get__(self.transformer.text_model.encoder) def text_encoder_forward( self, input_ids = None, attention_mask = None, position_ids = None, output_attentions = None, output_hidden_states = None, return_dict = None, embedding_manager = None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify either input_ids") input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids, embedding_manager=embedding_manager) bsz, seq_len = input_shape # CLIP's text model uses causal mask, prepare it here. # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( hidden_states.device ) # expand attention_mask if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _expand_mask(attention_mask, hidden_states.dtype) last_hidden_state = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = self.final_layer_norm(last_hidden_state) return last_hidden_state self.transformer.text_model.forward = text_encoder_forward.__get__(self.transformer.text_model) def transformer_forward( self, input_ids = None, attention_mask = None, position_ids = None, output_attentions = None, output_hidden_states = None, return_dict = None, embedding_manager = None, ): return self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, embedding_manager = embedding_manager ) self.transformer.forward = transformer_forward.__get__(self.transformer) def freeze(self): self.transformer = self.transformer.eval() for param in self.parameters(): param.requires_grad = False def forward(self, text, **kwargs): # import ipdb; ipdb.set_trace() batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, return_overflowing_tokens=False, padding="max_length", return_tensors="pt") tokens = batch_encoding["input_ids"].to(self.device) z = self.transformer(input_ids=tokens, **kwargs) return z def encode(self, text, **kwargs): return self(text, **kwargs) class FrozenCLIPTextEmbedder(nn.Module): """ Uses the CLIP transformer encoder for text. """ def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True): super().__init__() self.model, _ = clip.load(version, jit=False, device="cpu") self.device = device self.max_length = max_length self.n_repeat = n_repeat self.normalize = normalize def freeze(self): self.model = self.model.eval() for param in self.parameters(): param.requires_grad = False def forward(self, text): tokens = clip.tokenize(text).to(self.device) z = self.model.encode_text(tokens) if self.normalize: z = z / torch.linalg.norm(z, dim=1, keepdim=True) return z def encode(self, text): z = self(text) if z.ndim==2: z = z[:, None, :] z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat) return z class FrozenClipImageEmbedder(nn.Module): """ Uses the CLIP image encoder. """ def __init__( self, model, jit=False, device='cuda' if torch.cuda.is_available() else 'cpu', antialias=False, ): super().__init__() self.model, _ = clip.load(name=model, device=device, jit=jit) self.antialias = antialias self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False) self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False) def preprocess(self, x): # normalize to [0,1] x = kornia.geometry.resize(x, (224, 224), interpolation='bicubic',align_corners=True, antialias=self.antialias) x = (x + 1.) / 2. # renormalize according to clip x = kornia.enhance.normalize(x, self.mean, self.std) return x def forward(self, x): # x is assumed to be in range [-1,1] return self.model.encode_image(self.preprocess(x)) if __name__ == "__main__": from ldm.util import count_params model = FrozenCLIPEmbedder() count_params(model, verbose=True) ================================================ FILE: ldm/modules/encoders/modules_bak.py ================================================ import torch import torch.nn as nn from functools import partial import clip from einops import rearrange, repeat from transformers import CLIPTokenizer, CLIPTextModel import kornia from ldm.modules.x_transformer import Encoder, TransformerWrapper # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test def _expand_mask(mask, dtype, tgt_len = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) inverted_mask = 1.0 - expanded_mask return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) def _build_causal_attention_mask(bsz, seq_len, dtype): # lazily create causal attention mask, with full attention between the vision tokens # pytorch uses additive attention mask; fill with -inf mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) mask.fill_(torch.tensor(torch.finfo(dtype).min)) mask.triu_(1) # zero out the lower diagonal mask = mask.unsqueeze(1) # expand mask return mask class AbstractEncoder(nn.Module): def __init__(self): super().__init__() def encode(self, *args, **kwargs): raise NotImplementedError class ClassEmbedder(nn.Module): def __init__(self, embed_dim, n_classes=1000, key='class'): super().__init__() self.key = key self.embedding = nn.Embedding(n_classes, embed_dim) def forward(self, batch, key=None): if key is None: key = self.key # this is for use in crossattn c = batch[key][:, None] c = self.embedding(c) return c class TransformerEmbedder(AbstractEncoder): """Some transformer encoder layers""" def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"): super().__init__() self.device = device self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len, attn_layers=Encoder(dim=n_embed, depth=n_layer)) def forward(self, tokens): tokens = tokens.to(self.device) # meh z = self.transformer(tokens, return_embeddings=True) return z def encode(self, x): return self(x) class BERTTokenizer(AbstractEncoder): """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)""" def __init__(self, device="cuda", vq_interface=True, max_length=77): super().__init__() from transformers import BertTokenizerFast # TODO: add to reuquirements self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") self.device = device self.vq_interface = vq_interface self.max_length = max_length def forward(self, text): batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, return_overflowing_tokens=False, padding="max_length", return_tensors="pt") tokens = batch_encoding["input_ids"].to(self.device) return tokens @torch.no_grad() def encode(self, text): tokens = self(text) if not self.vq_interface: return tokens return None, None, [None, None, tokens] def decode(self, text): return text class BERTEmbedder(AbstractEncoder): """Uses the BERT tokenizr model and add some transformer encoder layers""" def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77, device="cuda",use_tokenizer=True, embedding_dropout=0.0): super().__init__() self.use_tknz_fn = use_tokenizer if self.use_tknz_fn: self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len) self.device = device self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len, attn_layers=Encoder(dim=n_embed, depth=n_layer), emb_dropout=embedding_dropout) def forward(self, text, embedding_manager=None): if self.use_tknz_fn: tokens = self.tknz_fn(text)#.to(self.device) else: tokens = text z = self.transformer(tokens, return_embeddings=True, embedding_manager=embedding_manager) return z def encode(self, text, **kwargs): # output of length 77 return self(text, **kwargs) class SpatialRescaler(nn.Module): def __init__(self, n_stages=1, method='bilinear', multiplier=0.5, in_channels=3, out_channels=None, bias=False): super().__init__() self.n_stages = n_stages assert self.n_stages >= 0 assert method in ['nearest','linear','bilinear','trilinear','bicubic','area'] self.multiplier = multiplier self.interpolator = partial(torch.nn.functional.interpolate, mode=method) self.remap_output = out_channels is not None if self.remap_output: print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.') self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias) def forward(self,x): for stage in range(self.n_stages): x = self.interpolator(x, scale_factor=self.multiplier) if self.remap_output: x = self.channel_mapper(x) return x def encode(self, x): return self(x) class FrozenCLIPEmbedder(AbstractEncoder): """Uses the CLIP transformer encoder for text (from Hugging Face)""" def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77): super().__init__() self.tokenizer = CLIPTokenizer.from_pretrained(version) self.transformer = CLIPTextModel.from_pretrained(version) self.device = device self.max_length = max_length self.freeze() def embedding_forward( self, input_ids = None, position_ids = None, inputs_embeds = None, embedding_manager = None, ) -> torch.Tensor: seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] if position_ids is None: position_ids = self.position_ids[:, :seq_length] if inputs_embeds is None: inputs_embeds = self.token_embedding(input_ids) if embedding_manager is not None: inputs_embeds = embedding_manager(input_ids, inputs_embeds) position_embeddings = self.position_embedding(position_ids) embeddings = inputs_embeds + position_embeddings return embeddings self.transformer.text_model.embeddings.forward = embedding_forward.__get__(self.transformer.text_model.embeddings) def encoder_forward( self, inputs_embeds, attention_mask = None, causal_attention_mask = None, output_attentions = None, output_hidden_states = None, return_dict = None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: encoder_states = encoder_states + (hidden_states,) layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, output_attentions=output_attentions, ) hidden_states = layer_outputs[0] if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: encoder_states = encoder_states + (hidden_states,) return hidden_states self.transformer.text_model.encoder.forward = encoder_forward.__get__(self.transformer.text_model.encoder) def text_encoder_forward( self, input_ids = None, attention_mask = None, position_ids = None, output_attentions = None, output_hidden_states = None, return_dict = None, embedding_manager = None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify either input_ids") input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids, embedding_manager=embedding_manager) bsz, seq_len = input_shape # CLIP's text model uses causal mask, prepare it here. # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 causal_attention_mask = _build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( hidden_states.device ) # expand attention_mask if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _expand_mask(attention_mask, hidden_states.dtype) last_hidden_state = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = self.final_layer_norm(last_hidden_state) return last_hidden_state self.transformer.text_model.forward = text_encoder_forward.__get__(self.transformer.text_model) def transformer_forward( self, input_ids = None, attention_mask = None, position_ids = None, output_attentions = None, output_hidden_states = None, return_dict = None, embedding_manager = None, ): return self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, embedding_manager = embedding_manager ) self.transformer.forward = transformer_forward.__get__(self.transformer) # def update_embedding_func(self, embedding_manager): # text_model = self.transformer.text_model # # text_model.old_embeddings = text_model.embeddings # # def new_embeddings( # # input_ids = None, # # position_ids = None, # # inputs_embeds = None, # # ) -> torch.Tensor: # # seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] # # if position_ids is None: # # position_ids = text_model.old_embeddings.position_ids[:, :seq_length] # # if inputs_embeds is None: # # inputs_embeds = text_model.old_embeddings.token_embedding(input_ids) # # inputs_embeds = embedding_manager(input_ids, inputs_embeds) # # position_embeddings = text_model.old_embeddings.position_embedding(position_ids) # # embeddings = inputs_embeds + position_embeddings # # return embeddings # # del text_model.embeddings # # text_model.embeddings = new_embeddings # # class NewEmbeddings(torch.nn.Module): # # def __init__(self, orig_embedder): # # super().__init__() # # self.orig_embedder = orig_embedder # # def forward( # # self, # # input_ids = None, # # position_ids = None, # # inputs_embeds = None, # # ) -> torch.Tensor: # # seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] # # if position_ids is None: # # position_ids = self.orig_embedder.position_ids[:, :seq_length] # # if inputs_embeds is None: # # inputs_embeds = self.orig_embedder.token_embedding(input_ids) # # inputs_embeds = embedding_manager(input_ids, inputs_embeds) # # position_embeddings = self.orig_embedder.position_embedding(position_ids) # # embeddings = inputs_embeds + position_embeddings # # return embeddings # # # self.new_embeddings = # # # text_model.embeddings = new_embeddings.__call__.__get__(text_model) # # text_model.embeddings = NewEmbeddings(text_model.embeddings) # class NewEmbeddings(torch.nn.Module): # def __init__(self, orig_embedder, embedding_manager): # super().__init__() # self.embedding_manager = embedding_manager # self.orig_embedder = orig_embedder # def forward( # self, # input_ids = None, # position_ids = None, # inputs_embeds = None, # ) -> torch.Tensor: # seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] # if position_ids is None: # position_ids = self.orig_embedder.position_ids[:, :seq_length] # if inputs_embeds is None: # inputs_embeds = self.orig_embedder.token_embedding(input_ids) # # init_embeds = inputs_embeds.clone() # inputs_embeds = self.embedding_manager(input_ids, inputs_embeds) # # print(inputs_embeds - init_embeds) # # print((inputs_embeds - init_embeds).max()) # # exit(0) # position_embeddings = self.orig_embedder.position_embedding(position_ids) # embeddings = inputs_embeds + position_embeddings # return embeddings # # self.new_embeddings = # # text_model.embeddings = new_embeddings.__call__.__get__(text_model) # text_model.embeddings = NewEmbeddings(text_model.embeddings, embedding_manager) def freeze(self): self.transformer = self.transformer.eval() for param in self.parameters(): param.requires_grad = False def forward(self, text, **kwargs): batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True, return_overflowing_tokens=False, padding="max_length", return_tensors="pt") tokens = batch_encoding["input_ids"].to(self.device) z = self.transformer(input_ids=tokens, **kwargs) return z def encode(self, text, **kwargs): return self(text, **kwargs) class FrozenCLIPTextEmbedder(nn.Module): """ Uses the CLIP transformer encoder for text. """ def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True): super().__init__() self.model, _ = clip.load(version, jit=False, device="cpu") self.device = device self.max_length = max_length self.n_repeat = n_repeat self.normalize = normalize def freeze(self): self.model = self.model.eval() for param in self.parameters(): param.requires_grad = False def forward(self, text): tokens = clip.tokenize(text).to(self.device) z = self.model.encode_text(tokens) if self.normalize: z = z / torch.linalg.norm(z, dim=1, keepdim=True) return z def encode(self, text): z = self(text) if z.ndim==2: z = z[:, None, :] z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat) return z class FrozenClipImageEmbedder(nn.Module): """ Uses the CLIP image encoder. """ def __init__( self, model, jit=False, device='cuda' if torch.cuda.is_available() else 'cpu', antialias=False, ): super().__init__() self.model, _ = clip.load(name=model, device=device, jit=jit) self.antialias = antialias self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False) self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False) def preprocess(self, x): # normalize to [0,1] x = kornia.geometry.resize(x, (224, 224), interpolation='bicubic',align_corners=True, antialias=self.antialias) x = (x + 1.) / 2. # renormalize according to clip x = kornia.enhance.normalize(x, self.mean, self.std) return x def forward(self, x): # x is assumed to be in range [-1,1] return self.model.encode_image(self.preprocess(x)) if __name__ == "__main__": from ldm.util import count_params model = FrozenCLIPEmbedder() count_params(model, verbose=True) ================================================ FILE: ldm/modules/image_degradation/__init__.py ================================================ from ldm.modules.image_degradation.bsrgan import degradation_bsrgan_variant as degradation_fn_bsr from ldm.modules.image_degradation.bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light ================================================ FILE: ldm/modules/image_degradation/bsrgan.py ================================================ # -*- coding: utf-8 -*- """ # -------------------------------------------- # Super-Resolution # -------------------------------------------- # # Kai Zhang (cskaizhang@gmail.com) # https://github.com/cszn # From 2019/03--2021/08 # -------------------------------------------- """ import numpy as np import cv2 import torch from functools import partial import random from scipy import ndimage import scipy import scipy.stats as ss from scipy.interpolate import interp2d from scipy.linalg import orth import albumentations import ldm.modules.image_degradation.utils_image as util def modcrop_np(img, sf): ''' Args: img: numpy image, WxH or WxHxC sf: scale factor Return: cropped image ''' w, h = img.shape[:2] im = np.copy(img) return im[:w - w % sf, :h - h % sf, ...] """ # -------------------------------------------- # anisotropic Gaussian kernels # -------------------------------------------- """ def analytic_kernel(k): """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)""" k_size = k.shape[0] # Calculate the big kernels size big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2)) # Loop over the small kernel to fill the big one for r in range(k_size): for c in range(k_size): big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k # Crop the edges of the big kernel to ignore very small values and increase run time of SR crop = k_size // 2 cropped_big_k = big_k[crop:-crop, crop:-crop] # Normalize to 1 return cropped_big_k / cropped_big_k.sum() def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): """ generate an anisotropic Gaussian kernel Args: ksize : e.g., 15, kernel size theta : [0, pi], rotation angle range l1 : [0.1,50], scaling of eigenvalues l2 : [0.1,l1], scaling of eigenvalues If l1 = l2, will get an isotropic Gaussian kernel. Returns: k : kernel """ v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.])) V = np.array([[v[0], v[1]], [v[1], -v[0]]]) D = np.array([[l1, 0], [0, l2]]) Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize) return k def gm_blur_kernel(mean, cov, size=15): center = size / 2.0 + 0.5 k = np.zeros([size, size]) for y in range(size): for x in range(size): cy = y - center + 1 cx = x - center + 1 k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov) k = k / np.sum(k) return k def shift_pixel(x, sf, upper_left=True): """shift pixel for super-resolution with different scale factors Args: x: WxHxC or WxH sf: scale factor upper_left: shift direction """ h, w = x.shape[:2] shift = (sf - 1) * 0.5 xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0) if upper_left: x1 = xv + shift y1 = yv + shift else: x1 = xv - shift y1 = yv - shift x1 = np.clip(x1, 0, w - 1) y1 = np.clip(y1, 0, h - 1) if x.ndim == 2: x = interp2d(xv, yv, x)(x1, y1) if x.ndim == 3: for i in range(x.shape[-1]): x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1) return x def blur(x, k): ''' x: image, NxcxHxW k: kernel, Nx1xhxw ''' n, c = x.shape[:2] p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2 x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate') k = k.repeat(1, c, 1, 1) k = k.view(-1, 1, k.shape[2], k.shape[3]) x = x.view(1, -1, x.shape[2], x.shape[3]) x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c) x = x.view(n, c, x.shape[2], x.shape[3]) return x def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0): """" # modified version of https://github.com/assafshocher/BlindSR_dataset_generator # Kai Zhang # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var # max_var = 2.5 * sf """ # Set random eigen-vals (lambdas) and angle (theta) for COV matrix lambda_1 = min_var + np.random.rand() * (max_var - min_var) lambda_2 = min_var + np.random.rand() * (max_var - min_var) theta = np.random.rand() * np.pi # random theta noise = -noise_level + np.random.rand(*k_size) * noise_level * 2 # Set COV matrix using Lambdas and Theta LAMBDA = np.diag([lambda_1, lambda_2]) Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) SIGMA = Q @ LAMBDA @ Q.T INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] # Set expectation position (shifting kernel for aligned image) MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) MU = MU[None, None, :, None] # Create meshgrid for Gaussian [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1])) Z = np.stack([X, Y], 2)[:, :, :, None] # Calcualte Gaussian for every pixel of the kernel ZZ = Z - MU ZZ_t = ZZ.transpose(0, 1, 3, 2) raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) # shift the kernel so it will be centered # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) # Normalize the kernel and return # kernel = raw_kernel_centered / np.sum(raw_kernel_centered) kernel = raw_kernel / np.sum(raw_kernel) return kernel def fspecial_gaussian(hsize, sigma): hsize = [hsize, hsize] siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] std = sigma [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) arg = -(x * x + y * y) / (2 * std * std) h = np.exp(arg) h[h < scipy.finfo(float).eps * h.max()] = 0 sumh = h.sum() if sumh != 0: h = h / sumh return h def fspecial_laplacian(alpha): alpha = max([0, min([alpha, 1])]) h1 = alpha / (alpha + 1) h2 = (1 - alpha) / (alpha + 1) h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]] h = np.array(h) return h def fspecial(filter_type, *args, **kwargs): ''' python code from: https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py ''' if filter_type == 'gaussian': return fspecial_gaussian(*args, **kwargs) if filter_type == 'laplacian': return fspecial_laplacian(*args, **kwargs) """ # -------------------------------------------- # degradation models # -------------------------------------------- """ def bicubic_degradation(x, sf=3): ''' Args: x: HxWxC image, [0, 1] sf: down-scale factor Return: bicubicly downsampled LR image ''' x = util.imresize_np(x, scale=1 / sf) return x def srmd_degradation(x, k, sf=3): ''' blur + bicubic downsampling Args: x: HxWxC image, [0, 1] k: hxw, double sf: down-scale factor Return: downsampled LR image Reference: @inproceedings{zhang2018learning, title={Learning a single convolutional super-resolution network for multiple degradations}, author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, pages={3262--3271}, year={2018} } ''' x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror' x = bicubic_degradation(x, sf=sf) return x def dpsr_degradation(x, k, sf=3): ''' bicubic downsampling + blur Args: x: HxWxC image, [0, 1] k: hxw, double sf: down-scale factor Return: downsampled LR image Reference: @inproceedings{zhang2019deep, title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels}, author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, pages={1671--1681}, year={2019} } ''' x = bicubic_degradation(x, sf=sf) x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') return x def classical_degradation(x, k, sf=3): ''' blur + downsampling Args: x: HxWxC image, [0, 1]/[0, 255] k: hxw, double sf: down-scale factor Return: downsampled LR image ''' x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2)) st = 0 return x[st::sf, st::sf, ...] def add_sharpening(img, weight=0.5, radius=50, threshold=10): """USM sharpening. borrowed from real-ESRGAN Input image: I; Blurry image: B. 1. K = I + weight * (I - B) 2. Mask = 1 if abs(I - B) > threshold, else: 0 3. Blur mask: 4. Out = Mask * K + (1 - Mask) * I Args: img (Numpy array): Input image, HWC, BGR; float32, [0, 1]. weight (float): Sharp weight. Default: 1. radius (float): Kernel size of Gaussian blur. Default: 50. threshold (int): """ if radius % 2 == 0: radius += 1 blur = cv2.GaussianBlur(img, (radius, radius), 0) residual = img - blur mask = np.abs(residual) * 255 > threshold mask = mask.astype('float32') soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0) K = img + weight * residual K = np.clip(K, 0, 1) return soft_mask * K + (1 - soft_mask) * img def add_blur(img, sf=4): wd2 = 4.0 + sf wd = 2.0 + 0.2 * sf if random.random() < 0.5: l1 = wd2 * random.random() l2 = wd2 * random.random() k = anisotropic_Gaussian(ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) else: k = fspecial('gaussian', 2 * random.randint(2, 11) + 3, wd * random.random()) img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror') return img def add_resize(img, sf=4): rnum = np.random.rand() if rnum > 0.8: # up sf1 = random.uniform(1, 2) elif rnum < 0.7: # down sf1 = random.uniform(0.5 / sf, 1) else: sf1 = 1.0 img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) img = np.clip(img, 0.0, 1.0) return img # def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): # noise_level = random.randint(noise_level1, noise_level2) # rnum = np.random.rand() # if rnum > 0.6: # add color Gaussian noise # img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) # elif rnum < 0.4: # add grayscale Gaussian noise # img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) # else: # add noise # L = noise_level2 / 255. # D = np.diag(np.random.rand(3)) # U = orth(np.random.rand(3, 3)) # conv = np.dot(np.dot(np.transpose(U), D), U) # img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) # img = np.clip(img, 0.0, 1.0) # return img def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) rnum = np.random.rand() if rnum > 0.6: # add color Gaussian noise img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: # add grayscale Gaussian noise img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: # add noise L = noise_level2 / 255. D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_speckle_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) img = np.clip(img, 0.0, 1.0) rnum = random.random() if rnum > 0.6: img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: L = noise_level2 / 255. D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_Poisson_noise(img): img = np.clip((img * 255.0).round(), 0, 255) / 255. vals = 10 ** (2 * random.random() + 2.0) # [2, 4] if random.random() < 0.5: img = np.random.poisson(img * vals).astype(np.float32) / vals else: img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255. noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray img += noise_gray[:, :, np.newaxis] img = np.clip(img, 0.0, 1.0) return img def add_JPEG_noise(img): quality_factor = random.randint(30, 95) img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) img = cv2.imdecode(encimg, 1) img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) return img def random_crop(lq, hq, sf=4, lq_patchsize=64): h, w = lq.shape[:2] rnd_h = random.randint(0, h - lq_patchsize) rnd_w = random.randint(0, w - lq_patchsize) lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :] return lq, hq def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): """ This is the degradation model of BSRGAN from the paper "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" ---------- img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) sf: scale factor isp_model: camera ISP model Returns ------- img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] """ isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 sf_ori = sf h1, w1 = img.shape[:2] img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: raise ValueError(f'img size ({h1}X{w1}) is too small!') hq = img.copy() if sf == 4 and random.random() < scale2_prob: # downsample1 if np.random.rand() < 0.5: img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), interpolation=random.choice([1, 2, 3])) else: img = util.imresize_np(img, 1 / 2, True) img = np.clip(img, 0.0, 1.0) sf = 2 shuffle_order = random.sample(range(7), 7) idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] for i in shuffle_order: if i == 0: img = add_blur(img, sf=sf) elif i == 1: img = add_blur(img, sf=sf) elif i == 2: a, b = img.shape[1], img.shape[0] # downsample2 if random.random() < 0.75: sf1 = random.uniform(1, 2 * sf) img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) else: k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror') img = img[0::sf, 0::sf, ...] # nearest downsampling img = np.clip(img, 0.0, 1.0) elif i == 3: # downsample3 img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) img = np.clip(img, 0.0, 1.0) elif i == 4: # add Gaussian noise img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) elif i == 5: # add JPEG noise if random.random() < jpeg_prob: img = add_JPEG_noise(img) elif i == 6: # add processed camera sensor noise if random.random() < isp_prob and isp_model is not None: with torch.no_grad(): img, hq = isp_model.forward(img.copy(), hq) # add final JPEG compression noise img = add_JPEG_noise(img) # random crop img, hq = random_crop(img, hq, sf_ori, lq_patchsize) return img, hq # todo no isp_model? def degradation_bsrgan_variant(image, sf=4, isp_model=None): """ This is the degradation model of BSRGAN from the paper "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" ---------- sf: scale factor isp_model: camera ISP model Returns ------- img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] """ image = util.uint2single(image) isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 sf_ori = sf h1, w1 = image.shape[:2] image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop h, w = image.shape[:2] hq = image.copy() if sf == 4 and random.random() < scale2_prob: # downsample1 if np.random.rand() < 0.5: image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), interpolation=random.choice([1, 2, 3])) else: image = util.imresize_np(image, 1 / 2, True) image = np.clip(image, 0.0, 1.0) sf = 2 shuffle_order = random.sample(range(7), 7) idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] for i in shuffle_order: if i == 0: image = add_blur(image, sf=sf) elif i == 1: image = add_blur(image, sf=sf) elif i == 2: a, b = image.shape[1], image.shape[0] # downsample2 if random.random() < 0.75: sf1 = random.uniform(1, 2 * sf) image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), interpolation=random.choice([1, 2, 3])) else: k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror') image = image[0::sf, 0::sf, ...] # nearest downsampling image = np.clip(image, 0.0, 1.0) elif i == 3: # downsample3 image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) image = np.clip(image, 0.0, 1.0) elif i == 4: # add Gaussian noise image = add_Gaussian_noise(image, noise_level1=2, noise_level2=25) elif i == 5: # add JPEG noise if random.random() < jpeg_prob: image = add_JPEG_noise(image) # elif i == 6: # # add processed camera sensor noise # if random.random() < isp_prob and isp_model is not None: # with torch.no_grad(): # img, hq = isp_model.forward(img.copy(), hq) # add final JPEG compression noise image = add_JPEG_noise(image) image = util.single2uint(image) example = {"image":image} return example # TODO incase there is a pickle error one needs to replace a += x with a = a + x in add_speckle_noise etc... def degradation_bsrgan_plus(img, sf=4, shuffle_prob=0.5, use_sharp=True, lq_patchsize=64, isp_model=None): """ This is an extended degradation model by combining the degradation models of BSRGAN and Real-ESRGAN ---------- img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) sf: scale factor use_shuffle: the degradation shuffle use_sharp: sharpening the img Returns ------- img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] """ h1, w1 = img.shape[:2] img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: raise ValueError(f'img size ({h1}X{w1}) is too small!') if use_sharp: img = add_sharpening(img) hq = img.copy() if random.random() < shuffle_prob: shuffle_order = random.sample(range(13), 13) else: shuffle_order = list(range(13)) # local shuffle for noise, JPEG is always the last one shuffle_order[2:6] = random.sample(shuffle_order[2:6], len(range(2, 6))) shuffle_order[9:13] = random.sample(shuffle_order[9:13], len(range(9, 13))) poisson_prob, speckle_prob, isp_prob = 0.1, 0.1, 0.1 for i in shuffle_order: if i == 0: img = add_blur(img, sf=sf) elif i == 1: img = add_resize(img, sf=sf) elif i == 2: img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) elif i == 3: if random.random() < poisson_prob: img = add_Poisson_noise(img) elif i == 4: if random.random() < speckle_prob: img = add_speckle_noise(img) elif i == 5: if random.random() < isp_prob and isp_model is not None: with torch.no_grad(): img, hq = isp_model.forward(img.copy(), hq) elif i == 6: img = add_JPEG_noise(img) elif i == 7: img = add_blur(img, sf=sf) elif i == 8: img = add_resize(img, sf=sf) elif i == 9: img = add_Gaussian_noise(img, noise_level1=2, noise_level2=25) elif i == 10: if random.random() < poisson_prob: img = add_Poisson_noise(img) elif i == 11: if random.random() < speckle_prob: img = add_speckle_noise(img) elif i == 12: if random.random() < isp_prob and isp_model is not None: with torch.no_grad(): img, hq = isp_model.forward(img.copy(), hq) else: print('check the shuffle!') # resize to desired size img = cv2.resize(img, (int(1 / sf * hq.shape[1]), int(1 / sf * hq.shape[0])), interpolation=random.choice([1, 2, 3])) # add final JPEG compression noise img = add_JPEG_noise(img) # random crop img, hq = random_crop(img, hq, sf, lq_patchsize) return img, hq if __name__ == '__main__': print("hey") img = util.imread_uint('utils/test.png', 3) print(img) img = util.uint2single(img) print(img) img = img[:448, :448] h = img.shape[0] // 4 print("resizing to", h) sf = 4 deg_fn = partial(degradation_bsrgan_variant, sf=sf) for i in range(20): print(i) img_lq = deg_fn(img) print(img_lq) img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img)["image"] print(img_lq.shape) print("bicubic", img_lq_bicubic.shape) print(img_hq.shape) lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0) lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0) img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) util.imsave(img_concat, str(i) + '.png') ================================================ FILE: ldm/modules/image_degradation/bsrgan_light.py ================================================ # -*- coding: utf-8 -*- import numpy as np import cv2 import torch from functools import partial import random from scipy import ndimage import scipy import scipy.stats as ss from scipy.interpolate import interp2d from scipy.linalg import orth import albumentations import ldm.modules.image_degradation.utils_image as util """ # -------------------------------------------- # Super-Resolution # -------------------------------------------- # # Kai Zhang (cskaizhang@gmail.com) # https://github.com/cszn # From 2019/03--2021/08 # -------------------------------------------- """ def modcrop_np(img, sf): ''' Args: img: numpy image, WxH or WxHxC sf: scale factor Return: cropped image ''' w, h = img.shape[:2] im = np.copy(img) return im[:w - w % sf, :h - h % sf, ...] """ # -------------------------------------------- # anisotropic Gaussian kernels # -------------------------------------------- """ def analytic_kernel(k): """Calculate the X4 kernel from the X2 kernel (for proof see appendix in paper)""" k_size = k.shape[0] # Calculate the big kernels size big_k = np.zeros((3 * k_size - 2, 3 * k_size - 2)) # Loop over the small kernel to fill the big one for r in range(k_size): for c in range(k_size): big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k # Crop the edges of the big kernel to ignore very small values and increase run time of SR crop = k_size // 2 cropped_big_k = big_k[crop:-crop, crop:-crop] # Normalize to 1 return cropped_big_k / cropped_big_k.sum() def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): """ generate an anisotropic Gaussian kernel Args: ksize : e.g., 15, kernel size theta : [0, pi], rotation angle range l1 : [0.1,50], scaling of eigenvalues l2 : [0.1,l1], scaling of eigenvalues If l1 = l2, will get an isotropic Gaussian kernel. Returns: k : kernel """ v = np.dot(np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), np.array([1., 0.])) V = np.array([[v[0], v[1]], [v[1], -v[0]]]) D = np.array([[l1, 0], [0, l2]]) Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) k = gm_blur_kernel(mean=[0, 0], cov=Sigma, size=ksize) return k def gm_blur_kernel(mean, cov, size=15): center = size / 2.0 + 0.5 k = np.zeros([size, size]) for y in range(size): for x in range(size): cy = y - center + 1 cx = x - center + 1 k[y, x] = ss.multivariate_normal.pdf([cx, cy], mean=mean, cov=cov) k = k / np.sum(k) return k def shift_pixel(x, sf, upper_left=True): """shift pixel for super-resolution with different scale factors Args: x: WxHxC or WxH sf: scale factor upper_left: shift direction """ h, w = x.shape[:2] shift = (sf - 1) * 0.5 xv, yv = np.arange(0, w, 1.0), np.arange(0, h, 1.0) if upper_left: x1 = xv + shift y1 = yv + shift else: x1 = xv - shift y1 = yv - shift x1 = np.clip(x1, 0, w - 1) y1 = np.clip(y1, 0, h - 1) if x.ndim == 2: x = interp2d(xv, yv, x)(x1, y1) if x.ndim == 3: for i in range(x.shape[-1]): x[:, :, i] = interp2d(xv, yv, x[:, :, i])(x1, y1) return x def blur(x, k): ''' x: image, NxcxHxW k: kernel, Nx1xhxw ''' n, c = x.shape[:2] p1, p2 = (k.shape[-2] - 1) // 2, (k.shape[-1] - 1) // 2 x = torch.nn.functional.pad(x, pad=(p1, p2, p1, p2), mode='replicate') k = k.repeat(1, c, 1, 1) k = k.view(-1, 1, k.shape[2], k.shape[3]) x = x.view(1, -1, x.shape[2], x.shape[3]) x = torch.nn.functional.conv2d(x, k, bias=None, stride=1, padding=0, groups=n * c) x = x.view(n, c, x.shape[2], x.shape[3]) return x def gen_kernel(k_size=np.array([15, 15]), scale_factor=np.array([4, 4]), min_var=0.6, max_var=10., noise_level=0): """" # modified version of https://github.com/assafshocher/BlindSR_dataset_generator # Kai Zhang # min_var = 0.175 * sf # variance of the gaussian kernel will be sampled between min_var and max_var # max_var = 2.5 * sf """ # Set random eigen-vals (lambdas) and angle (theta) for COV matrix lambda_1 = min_var + np.random.rand() * (max_var - min_var) lambda_2 = min_var + np.random.rand() * (max_var - min_var) theta = np.random.rand() * np.pi # random theta noise = -noise_level + np.random.rand(*k_size) * noise_level * 2 # Set COV matrix using Lambdas and Theta LAMBDA = np.diag([lambda_1, lambda_2]) Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) SIGMA = Q @ LAMBDA @ Q.T INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] # Set expectation position (shifting kernel for aligned image) MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) MU = MU[None, None, :, None] # Create meshgrid for Gaussian [X, Y] = np.meshgrid(range(k_size[0]), range(k_size[1])) Z = np.stack([X, Y], 2)[:, :, :, None] # Calcualte Gaussian for every pixel of the kernel ZZ = Z - MU ZZ_t = ZZ.transpose(0, 1, 3, 2) raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) # shift the kernel so it will be centered # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) # Normalize the kernel and return # kernel = raw_kernel_centered / np.sum(raw_kernel_centered) kernel = raw_kernel / np.sum(raw_kernel) return kernel def fspecial_gaussian(hsize, sigma): hsize = [hsize, hsize] siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] std = sigma [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) arg = -(x * x + y * y) / (2 * std * std) h = np.exp(arg) h[h < scipy.finfo(float).eps * h.max()] = 0 sumh = h.sum() if sumh != 0: h = h / sumh return h def fspecial_laplacian(alpha): alpha = max([0, min([alpha, 1])]) h1 = alpha / (alpha + 1) h2 = (1 - alpha) / (alpha + 1) h = [[h1, h2, h1], [h2, -4 / (alpha + 1), h2], [h1, h2, h1]] h = np.array(h) return h def fspecial(filter_type, *args, **kwargs): ''' python code from: https://github.com/ronaldosena/imagens-medicas-2/blob/40171a6c259edec7827a6693a93955de2bd39e76/Aulas/aula_2_-_uniform_filter/matlab_fspecial.py ''' if filter_type == 'gaussian': return fspecial_gaussian(*args, **kwargs) if filter_type == 'laplacian': return fspecial_laplacian(*args, **kwargs) """ # -------------------------------------------- # degradation models # -------------------------------------------- """ def bicubic_degradation(x, sf=3): ''' Args: x: HxWxC image, [0, 1] sf: down-scale factor Return: bicubicly downsampled LR image ''' x = util.imresize_np(x, scale=1 / sf) return x def srmd_degradation(x, k, sf=3): ''' blur + bicubic downsampling Args: x: HxWxC image, [0, 1] k: hxw, double sf: down-scale factor Return: downsampled LR image Reference: @inproceedings{zhang2018learning, title={Learning a single convolutional super-resolution network for multiple degradations}, author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, pages={3262--3271}, year={2018} } ''' x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # 'nearest' | 'mirror' x = bicubic_degradation(x, sf=sf) return x def dpsr_degradation(x, k, sf=3): ''' bicubic downsampling + blur Args: x: HxWxC image, [0, 1] k: hxw, double sf: down-scale factor Return: downsampled LR image Reference: @inproceedings{zhang2019deep, title={Deep Plug-and-Play Super-Resolution for Arbitrary Blur Kernels}, author={Zhang, Kai and Zuo, Wangmeng and Zhang, Lei}, booktitle={IEEE Conference on Computer Vision and Pattern Recognition}, pages={1671--1681}, year={2019} } ''' x = bicubic_degradation(x, sf=sf) x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') return x def classical_degradation(x, k, sf=3): ''' blur + downsampling Args: x: HxWxC image, [0, 1]/[0, 255] k: hxw, double sf: down-scale factor Return: downsampled LR image ''' x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode='wrap') # x = filters.correlate(x, np.expand_dims(np.flip(k), axis=2)) st = 0 return x[st::sf, st::sf, ...] def add_sharpening(img, weight=0.5, radius=50, threshold=10): """USM sharpening. borrowed from real-ESRGAN Input image: I; Blurry image: B. 1. K = I + weight * (I - B) 2. Mask = 1 if abs(I - B) > threshold, else: 0 3. Blur mask: 4. Out = Mask * K + (1 - Mask) * I Args: img (Numpy array): Input image, HWC, BGR; float32, [0, 1]. weight (float): Sharp weight. Default: 1. radius (float): Kernel size of Gaussian blur. Default: 50. threshold (int): """ if radius % 2 == 0: radius += 1 blur = cv2.GaussianBlur(img, (radius, radius), 0) residual = img - blur mask = np.abs(residual) * 255 > threshold mask = mask.astype('float32') soft_mask = cv2.GaussianBlur(mask, (radius, radius), 0) K = img + weight * residual K = np.clip(K, 0, 1) return soft_mask * K + (1 - soft_mask) * img def add_blur(img, sf=4): wd2 = 4.0 + sf wd = 2.0 + 0.2 * sf wd2 = wd2/4 wd = wd/4 if random.random() < 0.5: l1 = wd2 * random.random() l2 = wd2 * random.random() k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) else: k = fspecial('gaussian', random.randint(2, 4) + 3, wd * random.random()) img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode='mirror') return img def add_resize(img, sf=4): rnum = np.random.rand() if rnum > 0.8: # up sf1 = random.uniform(1, 2) elif rnum < 0.7: # down sf1 = random.uniform(0.5 / sf, 1) else: sf1 = 1.0 img = cv2.resize(img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) img = np.clip(img, 0.0, 1.0) return img # def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): # noise_level = random.randint(noise_level1, noise_level2) # rnum = np.random.rand() # if rnum > 0.6: # add color Gaussian noise # img += np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) # elif rnum < 0.4: # add grayscale Gaussian noise # img += np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) # else: # add noise # L = noise_level2 / 255. # D = np.diag(np.random.rand(3)) # U = orth(np.random.rand(3, 3)) # conv = np.dot(np.dot(np.transpose(U), D), U) # img += np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) # img = np.clip(img, 0.0, 1.0) # return img def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) rnum = np.random.rand() if rnum > 0.6: # add color Gaussian noise img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: # add grayscale Gaussian noise img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: # add noise L = noise_level2 / 255. D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_speckle_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) img = np.clip(img, 0.0, 1.0) rnum = random.random() if rnum > 0.6: img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: L = noise_level2 / 255. D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L ** 2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_Poisson_noise(img): img = np.clip((img * 255.0).round(), 0, 255) / 255. vals = 10 ** (2 * random.random() + 2.0) # [2, 4] if random.random() < 0.5: img = np.random.poisson(img * vals).astype(np.float32) / vals else: img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255. noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray img += noise_gray[:, :, np.newaxis] img = np.clip(img, 0.0, 1.0) return img def add_JPEG_noise(img): quality_factor = random.randint(80, 95) img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) result, encimg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) img = cv2.imdecode(encimg, 1) img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) return img def random_crop(lq, hq, sf=4, lq_patchsize=64): h, w = lq.shape[:2] rnd_h = random.randint(0, h - lq_patchsize) rnd_w = random.randint(0, w - lq_patchsize) lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize * sf, :] return lq, hq def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): """ This is the degradation model of BSRGAN from the paper "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" ---------- img: HXWXC, [0, 1], its size should be large than (lq_patchsizexsf)x(lq_patchsizexsf) sf: scale factor isp_model: camera ISP model Returns ------- img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] """ isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 sf_ori = sf h1, w1 = img.shape[:2] img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: raise ValueError(f'img size ({h1}X{w1}) is too small!') hq = img.copy() if sf == 4 and random.random() < scale2_prob: # downsample1 if np.random.rand() < 0.5: img = cv2.resize(img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), interpolation=random.choice([1, 2, 3])) else: img = util.imresize_np(img, 1 / 2, True) img = np.clip(img, 0.0, 1.0) sf = 2 shuffle_order = random.sample(range(7), 7) idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] for i in shuffle_order: if i == 0: img = add_blur(img, sf=sf) elif i == 1: img = add_blur(img, sf=sf) elif i == 2: a, b = img.shape[1], img.shape[0] # downsample2 if random.random() < 0.75: sf1 = random.uniform(1, 2 * sf) img = cv2.resize(img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), interpolation=random.choice([1, 2, 3])) else: k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode='mirror') img = img[0::sf, 0::sf, ...] # nearest downsampling img = np.clip(img, 0.0, 1.0) elif i == 3: # downsample3 img = cv2.resize(img, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) img = np.clip(img, 0.0, 1.0) elif i == 4: # add Gaussian noise img = add_Gaussian_noise(img, noise_level1=2, noise_level2=8) elif i == 5: # add JPEG noise if random.random() < jpeg_prob: img = add_JPEG_noise(img) elif i == 6: # add processed camera sensor noise if random.random() < isp_prob and isp_model is not None: with torch.no_grad(): img, hq = isp_model.forward(img.copy(), hq) # add final JPEG compression noise img = add_JPEG_noise(img) # random crop img, hq = random_crop(img, hq, sf_ori, lq_patchsize) return img, hq # todo no isp_model? def degradation_bsrgan_variant(image, sf=4, isp_model=None): """ This is the degradation model of BSRGAN from the paper "Designing a Practical Degradation Model for Deep Blind Image Super-Resolution" ---------- sf: scale factor isp_model: camera ISP model Returns ------- img: low-quality patch, size: lq_patchsizeXlq_patchsizeXC, range: [0, 1] hq: corresponding high-quality patch, size: (lq_patchsizexsf)X(lq_patchsizexsf)XC, range: [0, 1] """ image = util.uint2single(image) isp_prob, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 sf_ori = sf h1, w1 = image.shape[:2] image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop h, w = image.shape[:2] hq = image.copy() if sf == 4 and random.random() < scale2_prob: # downsample1 if np.random.rand() < 0.5: image = cv2.resize(image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), interpolation=random.choice([1, 2, 3])) else: image = util.imresize_np(image, 1 / 2, True) image = np.clip(image, 0.0, 1.0) sf = 2 shuffle_order = random.sample(range(7), 7) idx1, idx2 = shuffle_order.index(2), shuffle_order.index(3) if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = shuffle_order[idx2], shuffle_order[idx1] for i in shuffle_order: if i == 0: image = add_blur(image, sf=sf) # elif i == 1: # image = add_blur(image, sf=sf) if i == 0: pass elif i == 2: a, b = image.shape[1], image.shape[0] # downsample2 if random.random() < 0.8: sf1 = random.uniform(1, 2 * sf) image = cv2.resize(image, (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), interpolation=random.choice([1, 2, 3])) else: k = fspecial('gaussian', 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode='mirror') image = image[0::sf, 0::sf, ...] # nearest downsampling image = np.clip(image, 0.0, 1.0) elif i == 3: # downsample3 image = cv2.resize(image, (int(1 / sf * a), int(1 / sf * b)), interpolation=random.choice([1, 2, 3])) image = np.clip(image, 0.0, 1.0) elif i == 4: # add Gaussian noise image = add_Gaussian_noise(image, noise_level1=1, noise_level2=2) elif i == 5: # add JPEG noise if random.random() < jpeg_prob: image = add_JPEG_noise(image) # # elif i == 6: # # add processed camera sensor noise # if random.random() < isp_prob and isp_model is not None: # with torch.no_grad(): # img, hq = isp_model.forward(img.copy(), hq) # add final JPEG compression noise image = add_JPEG_noise(image) image = util.single2uint(image) example = {"image": image} return example if __name__ == '__main__': print("hey") img = util.imread_uint('utils/test.png', 3) img = img[:448, :448] h = img.shape[0] // 4 print("resizing to", h) sf = 4 deg_fn = partial(degradation_bsrgan_variant, sf=sf) for i in range(20): print(i) img_hq = img img_lq = deg_fn(img)["image"] img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) print(img_lq) img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"] print(img_lq.shape) print("bicubic", img_lq_bicubic.shape) print(img_hq.shape) lq_nearest = cv2.resize(util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0) lq_bicubic_nearest = cv2.resize(util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), interpolation=0) img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) util.imsave(img_concat, str(i) + '.png') ================================================ FILE: ldm/modules/image_degradation/utils_image.py ================================================ import os import math import random import numpy as np import torch import cv2 from torchvision.utils import make_grid from datetime import datetime #import matplotlib.pyplot as plt # TODO: check with Dominik, also bsrgan.py vs bsrgan_light.py os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE" ''' # -------------------------------------------- # Kai Zhang (github: https://github.com/cszn) # 03/Mar/2019 # -------------------------------------------- # https://github.com/twhui/SRGAN-pyTorch # https://github.com/xinntao/BasicSR # -------------------------------------------- ''' IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.tif'] def is_image_file(filename): return any(filename.endswith(extension) for extension in IMG_EXTENSIONS) def get_timestamp(): return datetime.now().strftime('%y%m%d-%H%M%S') def imshow(x, title=None, cbar=False, figsize=None): plt.figure(figsize=figsize) plt.imshow(np.squeeze(x), interpolation='nearest', cmap='gray') if title: plt.title(title) if cbar: plt.colorbar() plt.show() def surf(Z, cmap='rainbow', figsize=None): plt.figure(figsize=figsize) ax3 = plt.axes(projection='3d') w, h = Z.shape[:2] xx = np.arange(0,w,1) yy = np.arange(0,h,1) X, Y = np.meshgrid(xx, yy) ax3.plot_surface(X,Y,Z,cmap=cmap) #ax3.contour(X,Y,Z, zdim='z',offset=-2,cmap=cmap) plt.show() ''' # -------------------------------------------- # get image pathes # -------------------------------------------- ''' def get_image_paths(dataroot): paths = None # return None if dataroot is None if dataroot is not None: paths = sorted(_get_paths_from_images(dataroot)) return paths def _get_paths_from_images(path): assert os.path.isdir(path), '{:s} is not a valid directory'.format(path) images = [] for dirpath, _, fnames in sorted(os.walk(path)): for fname in sorted(fnames): if is_image_file(fname): img_path = os.path.join(dirpath, fname) images.append(img_path) assert images, '{:s} has no valid image file'.format(path) return images ''' # -------------------------------------------- # split large images into small images # -------------------------------------------- ''' def patches_from_image(img, p_size=512, p_overlap=64, p_max=800): w, h = img.shape[:2] patches = [] if w > p_max and h > p_max: w1 = list(np.arange(0, w-p_size, p_size-p_overlap, dtype=np.int)) h1 = list(np.arange(0, h-p_size, p_size-p_overlap, dtype=np.int)) w1.append(w-p_size) h1.append(h-p_size) # print(w1) # print(h1) for i in w1: for j in h1: patches.append(img[i:i+p_size, j:j+p_size,:]) else: patches.append(img) return patches def imssave(imgs, img_path): """ imgs: list, N images of size WxHxC """ img_name, ext = os.path.splitext(os.path.basename(img_path)) for i, img in enumerate(imgs): if img.ndim == 3: img = img[:, :, [2, 1, 0]] new_path = os.path.join(os.path.dirname(img_path), img_name+str('_s{:04d}'.format(i))+'.png') cv2.imwrite(new_path, img) def split_imageset(original_dataroot, taget_dataroot, n_channels=3, p_size=800, p_overlap=96, p_max=1000): """ split the large images from original_dataroot into small overlapped images with size (p_size)x(p_size), and save them into taget_dataroot; only the images with larger size than (p_max)x(p_max) will be splitted. Args: original_dataroot: taget_dataroot: p_size: size of small images p_overlap: patch size in training is a good choice p_max: images with smaller size than (p_max)x(p_max) keep unchanged. """ paths = get_image_paths(original_dataroot) for img_path in paths: # img_name, ext = os.path.splitext(os.path.basename(img_path)) img = imread_uint(img_path, n_channels=n_channels) patches = patches_from_image(img, p_size, p_overlap, p_max) imssave(patches, os.path.join(taget_dataroot,os.path.basename(img_path))) #if original_dataroot == taget_dataroot: #del img_path ''' # -------------------------------------------- # makedir # -------------------------------------------- ''' def mkdir(path): if not os.path.exists(path): os.makedirs(path) def mkdirs(paths): if isinstance(paths, str): mkdir(paths) else: for path in paths: mkdir(path) def mkdir_and_rename(path): if os.path.exists(path): new_name = path + '_archived_' + get_timestamp() print('Path already exists. Rename it to [{:s}]'.format(new_name)) os.rename(path, new_name) os.makedirs(path) ''' # -------------------------------------------- # read image from path # opencv is fast, but read BGR numpy image # -------------------------------------------- ''' # -------------------------------------------- # get uint8 image of size HxWxn_channles (RGB) # -------------------------------------------- def imread_uint(path, n_channels=3): # input: path # output: HxWx3(RGB or GGG), or HxWx1 (G) if n_channels == 1: img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE img = np.expand_dims(img, axis=2) # HxWx1 elif n_channels == 3: img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G if img.ndim == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG else: img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB return img # -------------------------------------------- # matlab's imwrite # -------------------------------------------- def imsave(img, img_path): img = np.squeeze(img) if img.ndim == 3: img = img[:, :, [2, 1, 0]] cv2.imwrite(img_path, img) def imwrite(img, img_path): img = np.squeeze(img) if img.ndim == 3: img = img[:, :, [2, 1, 0]] cv2.imwrite(img_path, img) # -------------------------------------------- # get single image of size HxWxn_channles (BGR) # -------------------------------------------- def read_img(path): # read image by cv2 # return: Numpy float32, HWC, BGR, [0,1] img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # cv2.IMREAD_GRAYSCALE img = img.astype(np.float32) / 255. if img.ndim == 2: img = np.expand_dims(img, axis=2) # some images have 4 channels if img.shape[2] > 3: img = img[:, :, :3] return img ''' # -------------------------------------------- # image format conversion # -------------------------------------------- # numpy(single) <---> numpy(unit) # numpy(single) <---> tensor # numpy(unit) <---> tensor # -------------------------------------------- ''' # -------------------------------------------- # numpy(single) [0, 1] <---> numpy(unit) # -------------------------------------------- def uint2single(img): return np.float32(img/255.) def single2uint(img): return np.uint8((img.clip(0, 1)*255.).round()) def uint162single(img): return np.float32(img/65535.) def single2uint16(img): return np.uint16((img.clip(0, 1)*65535.).round()) # -------------------------------------------- # numpy(unit) (HxWxC or HxW) <---> tensor # -------------------------------------------- # convert uint to 4-dimensional torch tensor def uint2tensor4(img): if img.ndim == 2: img = np.expand_dims(img, axis=2) return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.).unsqueeze(0) # convert uint to 3-dimensional torch tensor def uint2tensor3(img): if img.ndim == 2: img = np.expand_dims(img, axis=2) return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().div(255.) # convert 2/3/4-dimensional torch tensor to uint def tensor2uint(img): img = img.data.squeeze().float().clamp_(0, 1).cpu().numpy() if img.ndim == 3: img = np.transpose(img, (1, 2, 0)) return np.uint8((img*255.0).round()) # -------------------------------------------- # numpy(single) (HxWxC) <---> tensor # -------------------------------------------- # convert single (HxWxC) to 3-dimensional torch tensor def single2tensor3(img): return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float() # convert single (HxWxC) to 4-dimensional torch tensor def single2tensor4(img): return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1).float().unsqueeze(0) # convert torch tensor to single def tensor2single(img): img = img.data.squeeze().float().cpu().numpy() if img.ndim == 3: img = np.transpose(img, (1, 2, 0)) return img # convert torch tensor to single def tensor2single3(img): img = img.data.squeeze().float().cpu().numpy() if img.ndim == 3: img = np.transpose(img, (1, 2, 0)) elif img.ndim == 2: img = np.expand_dims(img, axis=2) return img def single2tensor5(img): return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float().unsqueeze(0) def single32tensor5(img): return torch.from_numpy(np.ascontiguousarray(img)).float().unsqueeze(0).unsqueeze(0) def single42tensor4(img): return torch.from_numpy(np.ascontiguousarray(img)).permute(2, 0, 1, 3).float() # from skimage.io import imread, imsave def tensor2img(tensor, out_type=np.uint8, min_max=(0, 1)): ''' Converts a torch Tensor into an image Numpy array of BGR channel order Input: 4D(B,(3/1),H,W), 3D(C,H,W), or 2D(H,W), any range, RGB channel order Output: 3D(H,W,C) or 2D(H,W), [0,255], np.uint8 (default) ''' tensor = tensor.squeeze().float().cpu().clamp_(*min_max) # squeeze first, then clamp tensor = (tensor - min_max[0]) / (min_max[1] - min_max[0]) # to range [0,1] n_dim = tensor.dim() if n_dim == 4: n_img = len(tensor) img_np = make_grid(tensor, nrow=int(math.sqrt(n_img)), normalize=False).numpy() img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR elif n_dim == 3: img_np = tensor.numpy() img_np = np.transpose(img_np[[2, 1, 0], :, :], (1, 2, 0)) # HWC, BGR elif n_dim == 2: img_np = tensor.numpy() else: raise TypeError( 'Only support 4D, 3D and 2D tensor. But received with dimension: {:d}'.format(n_dim)) if out_type == np.uint8: img_np = (img_np * 255.0).round() # Important. Unlike matlab, numpy.unit8() WILL NOT round by default. return img_np.astype(out_type) ''' # -------------------------------------------- # Augmentation, flipe and/or rotate # -------------------------------------------- # The following two are enough. # (1) augmet_img: numpy image of WxHxC or WxH # (2) augment_img_tensor4: tensor image 1xCxWxH # -------------------------------------------- ''' def augment_img(img, mode=0): '''Kai Zhang (github: https://github.com/cszn) ''' if mode == 0: return img elif mode == 1: return np.flipud(np.rot90(img)) elif mode == 2: return np.flipud(img) elif mode == 3: return np.rot90(img, k=3) elif mode == 4: return np.flipud(np.rot90(img, k=2)) elif mode == 5: return np.rot90(img) elif mode == 6: return np.rot90(img, k=2) elif mode == 7: return np.flipud(np.rot90(img, k=3)) def augment_img_tensor4(img, mode=0): '''Kai Zhang (github: https://github.com/cszn) ''' if mode == 0: return img elif mode == 1: return img.rot90(1, [2, 3]).flip([2]) elif mode == 2: return img.flip([2]) elif mode == 3: return img.rot90(3, [2, 3]) elif mode == 4: return img.rot90(2, [2, 3]).flip([2]) elif mode == 5: return img.rot90(1, [2, 3]) elif mode == 6: return img.rot90(2, [2, 3]) elif mode == 7: return img.rot90(3, [2, 3]).flip([2]) def augment_img_tensor(img, mode=0): '''Kai Zhang (github: https://github.com/cszn) ''' img_size = img.size() img_np = img.data.cpu().numpy() if len(img_size) == 3: img_np = np.transpose(img_np, (1, 2, 0)) elif len(img_size) == 4: img_np = np.transpose(img_np, (2, 3, 1, 0)) img_np = augment_img(img_np, mode=mode) img_tensor = torch.from_numpy(np.ascontiguousarray(img_np)) if len(img_size) == 3: img_tensor = img_tensor.permute(2, 0, 1) elif len(img_size) == 4: img_tensor = img_tensor.permute(3, 2, 0, 1) return img_tensor.type_as(img) def augment_img_np3(img, mode=0): if mode == 0: return img elif mode == 1: return img.transpose(1, 0, 2) elif mode == 2: return img[::-1, :, :] elif mode == 3: img = img[::-1, :, :] img = img.transpose(1, 0, 2) return img elif mode == 4: return img[:, ::-1, :] elif mode == 5: img = img[:, ::-1, :] img = img.transpose(1, 0, 2) return img elif mode == 6: img = img[:, ::-1, :] img = img[::-1, :, :] return img elif mode == 7: img = img[:, ::-1, :] img = img[::-1, :, :] img = img.transpose(1, 0, 2) return img def augment_imgs(img_list, hflip=True, rot=True): # horizontal flip OR rotate hflip = hflip and random.random() < 0.5 vflip = rot and random.random() < 0.5 rot90 = rot and random.random() < 0.5 def _augment(img): if hflip: img = img[:, ::-1, :] if vflip: img = img[::-1, :, :] if rot90: img = img.transpose(1, 0, 2) return img return [_augment(img) for img in img_list] ''' # -------------------------------------------- # modcrop and shave # -------------------------------------------- ''' def modcrop(img_in, scale): # img_in: Numpy, HWC or HW img = np.copy(img_in) if img.ndim == 2: H, W = img.shape H_r, W_r = H % scale, W % scale img = img[:H - H_r, :W - W_r] elif img.ndim == 3: H, W, C = img.shape H_r, W_r = H % scale, W % scale img = img[:H - H_r, :W - W_r, :] else: raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim)) return img def shave(img_in, border=0): # img_in: Numpy, HWC or HW img = np.copy(img_in) h, w = img.shape[:2] img = img[border:h-border, border:w-border] return img ''' # -------------------------------------------- # image processing process on numpy image # channel_convert(in_c, tar_type, img_list): # rgb2ycbcr(img, only_y=True): # bgr2ycbcr(img, only_y=True): # ycbcr2rgb(img): # -------------------------------------------- ''' def rgb2ycbcr(img, only_y=True): '''same as matlab rgb2ycbcr only_y: only return Y channel Input: uint8, [0, 255] float, [0, 1] ''' in_img_type = img.dtype img.astype(np.float32) if in_img_type != np.uint8: img *= 255. # convert if only_y: rlt = np.dot(img, [65.481, 128.553, 24.966]) / 255.0 + 16.0 else: rlt = np.matmul(img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], [24.966, 112.0, -18.214]]) / 255.0 + [16, 128, 128] if in_img_type == np.uint8: rlt = rlt.round() else: rlt /= 255. return rlt.astype(in_img_type) def ycbcr2rgb(img): '''same as matlab ycbcr2rgb Input: uint8, [0, 255] float, [0, 1] ''' in_img_type = img.dtype img.astype(np.float32) if in_img_type != np.uint8: img *= 255. # convert rlt = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071], [0.00625893, -0.00318811, 0]]) * 255.0 + [-222.921, 135.576, -276.836] if in_img_type == np.uint8: rlt = rlt.round() else: rlt /= 255. return rlt.astype(in_img_type) def bgr2ycbcr(img, only_y=True): '''bgr version of rgb2ycbcr only_y: only return Y channel Input: uint8, [0, 255] float, [0, 1] ''' in_img_type = img.dtype img.astype(np.float32) if in_img_type != np.uint8: img *= 255. # convert if only_y: rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0 else: rlt = np.matmul(img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128] if in_img_type == np.uint8: rlt = rlt.round() else: rlt /= 255. return rlt.astype(in_img_type) def channel_convert(in_c, tar_type, img_list): # conversion among BGR, gray and y if in_c == 3 and tar_type == 'gray': # BGR to gray gray_list = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list] return [np.expand_dims(img, axis=2) for img in gray_list] elif in_c == 3 and tar_type == 'y': # BGR to y y_list = [bgr2ycbcr(img, only_y=True) for img in img_list] return [np.expand_dims(img, axis=2) for img in y_list] elif in_c == 1 and tar_type == 'RGB': # gray/y to BGR return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list] else: return img_list ''' # -------------------------------------------- # metric, PSNR and SSIM # -------------------------------------------- ''' # -------------------------------------------- # PSNR # -------------------------------------------- def calculate_psnr(img1, img2, border=0): # img1 and img2 have range [0, 255] #img1 = img1.squeeze() #img2 = img2.squeeze() if not img1.shape == img2.shape: raise ValueError('Input images must have the same dimensions.') h, w = img1.shape[:2] img1 = img1[border:h-border, border:w-border] img2 = img2[border:h-border, border:w-border] img1 = img1.astype(np.float64) img2 = img2.astype(np.float64) mse = np.mean((img1 - img2)**2) if mse == 0: return float('inf') return 20 * math.log10(255.0 / math.sqrt(mse)) # -------------------------------------------- # SSIM # -------------------------------------------- def calculate_ssim(img1, img2, border=0): '''calculate SSIM the same outputs as MATLAB's img1, img2: [0, 255] ''' #img1 = img1.squeeze() #img2 = img2.squeeze() if not img1.shape == img2.shape: raise ValueError('Input images must have the same dimensions.') h, w = img1.shape[:2] img1 = img1[border:h-border, border:w-border] img2 = img2[border:h-border, border:w-border] if img1.ndim == 2: return ssim(img1, img2) elif img1.ndim == 3: if img1.shape[2] == 3: ssims = [] for i in range(3): ssims.append(ssim(img1[:,:,i], img2[:,:,i])) return np.array(ssims).mean() elif img1.shape[2] == 1: return ssim(np.squeeze(img1), np.squeeze(img2)) else: raise ValueError('Wrong input image dimensions.') def ssim(img1, img2): C1 = (0.01 * 255)**2 C2 = (0.03 * 255)**2 img1 = img1.astype(np.float64) img2 = img2.astype(np.float64) kernel = cv2.getGaussianKernel(11, 1.5) window = np.outer(kernel, kernel.transpose()) mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5] # valid mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5] mu1_sq = mu1**2 mu2_sq = mu2**2 mu1_mu2 = mu1 * mu2 sigma1_sq = cv2.filter2D(img1**2, -1, window)[5:-5, 5:-5] - mu1_sq sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) return ssim_map.mean() ''' # -------------------------------------------- # matlab's bicubic imresize (numpy and torch) [0, 1] # -------------------------------------------- ''' # matlab 'imresize' function, now only support 'bicubic' def cubic(x): absx = torch.abs(x) absx2 = absx**2 absx3 = absx**3 return (1.5*absx3 - 2.5*absx2 + 1) * ((absx <= 1).type_as(absx)) + \ (-0.5*absx3 + 2.5*absx2 - 4*absx + 2) * (((absx > 1)*(absx <= 2)).type_as(absx)) def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing): if (scale < 1) and (antialiasing): # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width kernel_width = kernel_width / scale # Output-space coordinates x = torch.linspace(1, out_length, out_length) # Input-space coordinates. Calculate the inverse mapping such that 0.5 # in output space maps to 0.5 in input space, and 0.5+scale in output # space maps to 1.5 in input space. u = x / scale + 0.5 * (1 - 1 / scale) # What is the left-most pixel that can be involved in the computation? left = torch.floor(u - kernel_width / 2) # What is the maximum number of pixels that can be involved in the # computation? Note: it's OK to use an extra pixel here; if the # corresponding weights are all zero, it will be eliminated at the end # of this function. P = math.ceil(kernel_width) + 2 # The indices of the input pixels involved in computing the k-th output # pixel are in row k of the indices matrix. indices = left.view(out_length, 1).expand(out_length, P) + torch.linspace(0, P - 1, P).view( 1, P).expand(out_length, P) # The weights used to compute the k-th output pixel are in row k of the # weights matrix. distance_to_center = u.view(out_length, 1).expand(out_length, P) - indices # apply cubic kernel if (scale < 1) and (antialiasing): weights = scale * cubic(distance_to_center * scale) else: weights = cubic(distance_to_center) # Normalize the weights matrix so that each row sums to 1. weights_sum = torch.sum(weights, 1).view(out_length, 1) weights = weights / weights_sum.expand(out_length, P) # If a column in weights is all zero, get rid of it. only consider the first and last column. weights_zero_tmp = torch.sum((weights == 0), 0) if not math.isclose(weights_zero_tmp[0], 0, rel_tol=1e-6): indices = indices.narrow(1, 1, P - 2) weights = weights.narrow(1, 1, P - 2) if not math.isclose(weights_zero_tmp[-1], 0, rel_tol=1e-6): indices = indices.narrow(1, 0, P - 2) weights = weights.narrow(1, 0, P - 2) weights = weights.contiguous() indices = indices.contiguous() sym_len_s = -indices.min() + 1 sym_len_e = indices.max() - in_length indices = indices + sym_len_s - 1 return weights, indices, int(sym_len_s), int(sym_len_e) # -------------------------------------------- # imresize for tensor image [0, 1] # -------------------------------------------- def imresize(img, scale, antialiasing=True): # Now the scale should be the same for H and W # input: img: pytorch tensor, CHW or HW [0,1] # output: CHW or HW [0,1] w/o round need_squeeze = True if img.dim() == 2 else False if need_squeeze: img.unsqueeze_(0) in_C, in_H, in_W = img.size() out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale) kernel_width = 4 kernel = 'cubic' # Return the desired dimension order for performing the resize. The # strategy is to perform the resize first along the dimension with the # smallest scale factor. # Now we do not support this. # get weights and indices weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( in_H, out_H, scale, kernel, kernel_width, antialiasing) weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( in_W, out_W, scale, kernel, kernel_width, antialiasing) # process H dimension # symmetric copying img_aug = torch.FloatTensor(in_C, in_H + sym_len_Hs + sym_len_He, in_W) img_aug.narrow(1, sym_len_Hs, in_H).copy_(img) sym_patch = img[:, :sym_len_Hs, :] inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(1, inv_idx) img_aug.narrow(1, 0, sym_len_Hs).copy_(sym_patch_inv) sym_patch = img[:, -sym_len_He:, :] inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(1, inv_idx) img_aug.narrow(1, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv) out_1 = torch.FloatTensor(in_C, out_H, in_W) kernel_width = weights_H.size(1) for i in range(out_H): idx = int(indices_H[i][0]) for j in range(out_C): out_1[j, i, :] = img_aug[j, idx:idx + kernel_width, :].transpose(0, 1).mv(weights_H[i]) # process W dimension # symmetric copying out_1_aug = torch.FloatTensor(in_C, out_H, in_W + sym_len_Ws + sym_len_We) out_1_aug.narrow(2, sym_len_Ws, in_W).copy_(out_1) sym_patch = out_1[:, :, :sym_len_Ws] inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(2, inv_idx) out_1_aug.narrow(2, 0, sym_len_Ws).copy_(sym_patch_inv) sym_patch = out_1[:, :, -sym_len_We:] inv_idx = torch.arange(sym_patch.size(2) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(2, inv_idx) out_1_aug.narrow(2, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv) out_2 = torch.FloatTensor(in_C, out_H, out_W) kernel_width = weights_W.size(1) for i in range(out_W): idx = int(indices_W[i][0]) for j in range(out_C): out_2[j, :, i] = out_1_aug[j, :, idx:idx + kernel_width].mv(weights_W[i]) if need_squeeze: out_2.squeeze_() return out_2 # -------------------------------------------- # imresize for numpy image [0, 1] # -------------------------------------------- def imresize_np(img, scale, antialiasing=True): # Now the scale should be the same for H and W # input: img: Numpy, HWC or HW [0,1] # output: HWC or HW [0,1] w/o round img = torch.from_numpy(img) need_squeeze = True if img.dim() == 2 else False if need_squeeze: img.unsqueeze_(2) in_H, in_W, in_C = img.size() out_C, out_H, out_W = in_C, math.ceil(in_H * scale), math.ceil(in_W * scale) kernel_width = 4 kernel = 'cubic' # Return the desired dimension order for performing the resize. The # strategy is to perform the resize first along the dimension with the # smallest scale factor. # Now we do not support this. # get weights and indices weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( in_H, out_H, scale, kernel, kernel_width, antialiasing) weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( in_W, out_W, scale, kernel, kernel_width, antialiasing) # process H dimension # symmetric copying img_aug = torch.FloatTensor(in_H + sym_len_Hs + sym_len_He, in_W, in_C) img_aug.narrow(0, sym_len_Hs, in_H).copy_(img) sym_patch = img[:sym_len_Hs, :, :] inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(0, inv_idx) img_aug.narrow(0, 0, sym_len_Hs).copy_(sym_patch_inv) sym_patch = img[-sym_len_He:, :, :] inv_idx = torch.arange(sym_patch.size(0) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(0, inv_idx) img_aug.narrow(0, sym_len_Hs + in_H, sym_len_He).copy_(sym_patch_inv) out_1 = torch.FloatTensor(out_H, in_W, in_C) kernel_width = weights_H.size(1) for i in range(out_H): idx = int(indices_H[i][0]) for j in range(out_C): out_1[i, :, j] = img_aug[idx:idx + kernel_width, :, j].transpose(0, 1).mv(weights_H[i]) # process W dimension # symmetric copying out_1_aug = torch.FloatTensor(out_H, in_W + sym_len_Ws + sym_len_We, in_C) out_1_aug.narrow(1, sym_len_Ws, in_W).copy_(out_1) sym_patch = out_1[:, :sym_len_Ws, :] inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(1, inv_idx) out_1_aug.narrow(1, 0, sym_len_Ws).copy_(sym_patch_inv) sym_patch = out_1[:, -sym_len_We:, :] inv_idx = torch.arange(sym_patch.size(1) - 1, -1, -1).long() sym_patch_inv = sym_patch.index_select(1, inv_idx) out_1_aug.narrow(1, sym_len_Ws + in_W, sym_len_We).copy_(sym_patch_inv) out_2 = torch.FloatTensor(out_H, out_W, in_C) kernel_width = weights_W.size(1) for i in range(out_W): idx = int(indices_W[i][0]) for j in range(out_C): out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv(weights_W[i]) if need_squeeze: out_2.squeeze_() return out_2.numpy() if __name__ == '__main__': print('---') # img = imread_uint('test.bmp', 3) # img = uint2single(img) # img_bicubic = imresize_np(img, 1/4) ================================================ FILE: ldm/modules/losses/__init__.py ================================================ from ldm.modules.losses.contperceptual import LPIPSWithDiscriminator ================================================ FILE: ldm/modules/losses/contperceptual.py ================================================ import torch import torch.nn as nn from taming.modules.losses.vqperceptual import * # TODO: taming dependency yes/no? class LPIPSWithDiscriminator(nn.Module): def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixelloss_weight=1.0, disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, disc_loss="hinge"): super().__init__() assert disc_loss in ["hinge", "vanilla"] self.kl_weight = kl_weight self.pixel_weight = pixelloss_weight self.perceptual_loss = LPIPS().eval() self.perceptual_weight = perceptual_weight # output log variance self.logvar = nn.Parameter(torch.ones(size=()) * logvar_init) self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm ).apply(weights_init) self.discriminator_iter_start = disc_start self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss self.disc_factor = disc_factor self.discriminator_weight = disc_weight self.disc_conditional = disc_conditional def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): if last_layer is not None: nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] else: nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() d_weight = d_weight * self.discriminator_weight return d_weight def forward(self, inputs, reconstructions, posteriors, optimizer_idx, global_step, last_layer=None, cond=None, split="train", weights=None): rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) if self.perceptual_weight > 0: p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) rec_loss = rec_loss + self.perceptual_weight * p_loss nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar weighted_nll_loss = nll_loss if weights is not None: weighted_nll_loss = weights*nll_loss weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] kl_loss = posteriors.kl() kl_loss = torch.sum(kl_loss) / kl_loss.shape[0] # now the GAN part if optimizer_idx == 0: # generator update if cond is None: assert not self.disc_conditional logits_fake = self.discriminator(reconstructions.contiguous()) else: assert self.disc_conditional logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) g_loss = -torch.mean(logits_fake) if self.disc_factor > 0.0: try: d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) except RuntimeError: assert not self.training d_weight = torch.tensor(0.0) else: d_weight = torch.tensor(0.0) disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/logvar".format(split): self.logvar.detach(), "{}/kl_loss".format(split): kl_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(), "{}/rec_loss".format(split): rec_loss.detach().mean(), "{}/d_weight".format(split): d_weight.detach(), "{}/disc_factor".format(split): torch.tensor(disc_factor), "{}/g_loss".format(split): g_loss.detach().mean(), } return loss, log if optimizer_idx == 1: # second pass for discriminator update if cond is None: logits_real = self.discriminator(inputs.contiguous().detach()) logits_fake = self.discriminator(reconstructions.contiguous().detach()) else: logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), "{}/logits_real".format(split): logits_real.detach().mean(), "{}/logits_fake".format(split): logits_fake.detach().mean() } return d_loss, log ================================================ FILE: ldm/modules/losses/vqperceptual.py ================================================ import torch from torch import nn import torch.nn.functional as F from einops import repeat from taming.modules.discriminator.model import NLayerDiscriminator, weights_init from taming.modules.losses.lpips import LPIPS from taming.modules.losses.vqperceptual import hinge_d_loss, vanilla_d_loss def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights): assert weights.shape[0] == logits_real.shape[0] == logits_fake.shape[0] loss_real = torch.mean(F.relu(1. - logits_real), dim=[1,2,3]) loss_fake = torch.mean(F.relu(1. + logits_fake), dim=[1,2,3]) loss_real = (weights * loss_real).sum() / weights.sum() loss_fake = (weights * loss_fake).sum() / weights.sum() d_loss = 0.5 * (loss_real + loss_fake) return d_loss def adopt_weight(weight, global_step, threshold=0, value=0.): if global_step < threshold: weight = value return weight def measure_perplexity(predicted_indices, n_embed): # src: https://github.com/karpathy/deep-vector-quantization/blob/main/model.py # eval cluster perplexity. when perplexity == num_embeddings then all clusters are used exactly equally encodings = F.one_hot(predicted_indices, n_embed).float().reshape(-1, n_embed) avg_probs = encodings.mean(0) perplexity = (-(avg_probs * torch.log(avg_probs + 1e-10)).sum()).exp() cluster_use = torch.sum(avg_probs > 0) return perplexity, cluster_use def l1(x, y): return torch.abs(x-y) def l2(x, y): return torch.pow((x-y), 2) class VQLPIPSWithDiscriminator(nn.Module): def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0, disc_num_layers=3, disc_in_channels=3, disc_factor=1.0, disc_weight=1.0, perceptual_weight=1.0, use_actnorm=False, disc_conditional=False, disc_ndf=64, disc_loss="hinge", n_classes=None, perceptual_loss="lpips", pixel_loss="l1"): super().__init__() assert disc_loss in ["hinge", "vanilla"] assert perceptual_loss in ["lpips", "clips", "dists"] assert pixel_loss in ["l1", "l2"] self.codebook_weight = codebook_weight self.pixel_weight = pixelloss_weight if perceptual_loss == "lpips": print(f"{self.__class__.__name__}: Running with LPIPS.") self.perceptual_loss = LPIPS().eval() else: raise ValueError(f"Unknown perceptual loss: >> {perceptual_loss} <<") self.perceptual_weight = perceptual_weight if pixel_loss == "l1": self.pixel_loss = l1 else: self.pixel_loss = l2 self.discriminator = NLayerDiscriminator(input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm, ndf=disc_ndf ).apply(weights_init) self.discriminator_iter_start = disc_start if disc_loss == "hinge": self.disc_loss = hinge_d_loss elif disc_loss == "vanilla": self.disc_loss = vanilla_d_loss else: raise ValueError(f"Unknown GAN loss '{disc_loss}'.") print(f"VQLPIPSWithDiscriminator running with {disc_loss} loss.") self.disc_factor = disc_factor self.discriminator_weight = disc_weight self.disc_conditional = disc_conditional self.n_classes = n_classes def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): if last_layer is not None: nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0] else: nll_grads = torch.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] g_grads = torch.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4) d_weight = torch.clamp(d_weight, 0.0, 1e4).detach() d_weight = d_weight * self.discriminator_weight return d_weight def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx, global_step, last_layer=None, cond=None, split="train", predicted_indices=None): if not exists(codebook_loss): codebook_loss = torch.tensor([0.]).to(inputs.device) #rec_loss = torch.abs(inputs.contiguous() - reconstructions.contiguous()) rec_loss = self.pixel_loss(inputs.contiguous(), reconstructions.contiguous()) if self.perceptual_weight > 0: p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous()) rec_loss = rec_loss + self.perceptual_weight * p_loss else: p_loss = torch.tensor([0.0]) nll_loss = rec_loss #nll_loss = torch.sum(nll_loss) / nll_loss.shape[0] nll_loss = torch.mean(nll_loss) # now the GAN part if optimizer_idx == 0: # generator update if cond is None: assert not self.disc_conditional logits_fake = self.discriminator(reconstructions.contiguous()) else: assert self.disc_conditional logits_fake = self.discriminator(torch.cat((reconstructions.contiguous(), cond), dim=1)) g_loss = -torch.mean(logits_fake) try: d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) except RuntimeError: assert not self.training d_weight = torch.tensor(0.0) disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) loss = nll_loss + d_weight * disc_factor * g_loss + self.codebook_weight * codebook_loss.mean() log = {"{}/total_loss".format(split): loss.clone().detach().mean(), "{}/quant_loss".format(split): codebook_loss.detach().mean(), "{}/nll_loss".format(split): nll_loss.detach().mean(), "{}/rec_loss".format(split): rec_loss.detach().mean(), "{}/p_loss".format(split): p_loss.detach().mean(), "{}/d_weight".format(split): d_weight.detach(), "{}/disc_factor".format(split): torch.tensor(disc_factor), "{}/g_loss".format(split): g_loss.detach().mean(), } if predicted_indices is not None: assert self.n_classes is not None with torch.no_grad(): perplexity, cluster_usage = measure_perplexity(predicted_indices, self.n_classes) log[f"{split}/perplexity"] = perplexity log[f"{split}/cluster_usage"] = cluster_usage return loss, log if optimizer_idx == 1: # second pass for discriminator update if cond is None: logits_real = self.discriminator(inputs.contiguous().detach()) logits_fake = self.discriminator(reconstructions.contiguous().detach()) else: logits_real = self.discriminator(torch.cat((inputs.contiguous().detach(), cond), dim=1)) logits_fake = self.discriminator(torch.cat((reconstructions.contiguous().detach(), cond), dim=1)) disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) log = {"{}/disc_loss".format(split): d_loss.clone().detach().mean(), "{}/logits_real".format(split): logits_real.detach().mean(), "{}/logits_fake".format(split): logits_fake.detach().mean() } return d_loss, log ================================================ FILE: ldm/modules/x_transformer.py ================================================ """shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers""" import torch from torch import nn, einsum import torch.nn.functional as F from functools import partial from inspect import isfunction from collections import namedtuple from einops import rearrange, repeat, reduce # constants DEFAULT_DIM_HEAD = 64 Intermediates = namedtuple('Intermediates', [ 'pre_softmax_attn', 'post_softmax_attn' ]) LayerIntermediates = namedtuple('Intermediates', [ 'hiddens', 'attn_intermediates' ]) class AbsolutePositionalEmbedding(nn.Module): def __init__(self, dim, max_seq_len): super().__init__() self.emb = nn.Embedding(max_seq_len, dim) self.init_() def init_(self): nn.init.normal_(self.emb.weight, std=0.02) def forward(self, x): n = torch.arange(x.shape[1], device=x.device) return self.emb(n)[None, :, :] class FixedPositionalEmbedding(nn.Module): def __init__(self, dim): super().__init__() inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) self.register_buffer('inv_freq', inv_freq) def forward(self, x, seq_dim=1, offset=0): t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq) emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) return emb[None, :, :] # helpers def exists(val): return val is not None def default(val, d): if exists(val): return val return d() if isfunction(d) else d def always(val): def inner(*args, **kwargs): return val return inner def not_equals(val): def inner(x): return x != val return inner def equals(val): def inner(x): return x == val return inner def max_neg_value(tensor): return -torch.finfo(tensor.dtype).max # keyword argument helpers def pick_and_pop(keys, d): values = list(map(lambda key: d.pop(key), keys)) return dict(zip(keys, values)) def group_dict_by_key(cond, d): return_val = [dict(), dict()] for key in d.keys(): match = bool(cond(key)) ind = int(not match) return_val[ind][key] = d[key] return (*return_val,) def string_begins_with(prefix, str): return str.startswith(prefix) def group_by_key_prefix(prefix, d): return group_dict_by_key(partial(string_begins_with, prefix), d) def groupby_prefix_and_trim(prefix, d): kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d) kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix):], x[1]), tuple(kwargs_with_prefix.items()))) return kwargs_without_prefix, kwargs # classes class Scale(nn.Module): def __init__(self, value, fn): super().__init__() self.value = value self.fn = fn def forward(self, x, **kwargs): x, *rest = self.fn(x, **kwargs) return (x * self.value, *rest) class Rezero(nn.Module): def __init__(self, fn): super().__init__() self.fn = fn self.g = nn.Parameter(torch.zeros(1)) def forward(self, x, **kwargs): x, *rest = self.fn(x, **kwargs) return (x * self.g, *rest) class ScaleNorm(nn.Module): def __init__(self, dim, eps=1e-5): super().__init__() self.scale = dim ** -0.5 self.eps = eps self.g = nn.Parameter(torch.ones(1)) def forward(self, x): norm = torch.norm(x, dim=-1, keepdim=True) * self.scale return x / norm.clamp(min=self.eps) * self.g class RMSNorm(nn.Module): def __init__(self, dim, eps=1e-8): super().__init__() self.scale = dim ** -0.5 self.eps = eps self.g = nn.Parameter(torch.ones(dim)) def forward(self, x): norm = torch.norm(x, dim=-1, keepdim=True) * self.scale return x / norm.clamp(min=self.eps) * self.g class Residual(nn.Module): def forward(self, x, residual): return x + residual class GRUGating(nn.Module): def __init__(self, dim): super().__init__() self.gru = nn.GRUCell(dim, dim) def forward(self, x, residual): gated_output = self.gru( rearrange(x, 'b n d -> (b n) d'), rearrange(residual, 'b n d -> (b n) d') ) return gated_output.reshape_as(x) # feedforward class GEGLU(nn.Module): def __init__(self, dim_in, dim_out): super().__init__() self.proj = nn.Linear(dim_in, dim_out * 2) def forward(self, x): x, gate = self.proj(x).chunk(2, dim=-1) return x * F.gelu(gate) class FeedForward(nn.Module): def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): super().__init__() inner_dim = int(dim * mult) dim_out = default(dim_out, dim) project_in = nn.Sequential( nn.Linear(dim, inner_dim), nn.GELU() ) if not glu else GEGLU(dim, inner_dim) self.net = nn.Sequential( project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out) ) def forward(self, x): return self.net(x) # attention. class Attention(nn.Module): def __init__( self, dim, dim_head=DEFAULT_DIM_HEAD, heads=8, causal=False, mask=None, talking_heads=False, sparse_topk=None, use_entmax15=False, num_mem_kv=0, dropout=0., on_attn=False ): super().__init__() if use_entmax15: raise NotImplementedError("Check out entmax activation instead of softmax activation!") self.scale = dim_head ** -0.5 self.heads = heads self.causal = causal self.mask = mask inner_dim = dim_head * heads self.to_q = nn.Linear(dim, inner_dim, bias=False) self.to_k = nn.Linear(dim, inner_dim, bias=False) self.to_v = nn.Linear(dim, inner_dim, bias=False) self.dropout = nn.Dropout(dropout) # talking heads self.talking_heads = talking_heads if talking_heads: self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) # explicit topk sparse attention self.sparse_topk = sparse_topk # entmax #self.attn_fn = entmax15 if use_entmax15 else F.softmax self.attn_fn = F.softmax # add memory key / values self.num_mem_kv = num_mem_kv if num_mem_kv > 0: self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) # attention on attention self.attn_on_attn = on_attn self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim) def forward( self, x, context=None, mask=None, context_mask=None, rel_pos=None, sinusoidal_emb=None, prev_attn=None, mem=None ): b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device kv_input = default(context, x) q_input = x k_input = kv_input v_input = kv_input if exists(mem): k_input = torch.cat((mem, k_input), dim=-2) v_input = torch.cat((mem, v_input), dim=-2) if exists(sinusoidal_emb): # in shortformer, the query would start at a position offset depending on the past cached memory offset = k_input.shape[-2] - q_input.shape[-2] q_input = q_input + sinusoidal_emb(q_input, offset=offset) k_input = k_input + sinusoidal_emb(k_input) q = self.to_q(q_input) k = self.to_k(k_input) v = self.to_v(v_input) q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v)) input_mask = None if any(map(exists, (mask, context_mask))): q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool()) k_mask = q_mask if not exists(context) else context_mask k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool()) q_mask = rearrange(q_mask, 'b i -> b () i ()') k_mask = rearrange(k_mask, 'b j -> b () () j') input_mask = q_mask * k_mask if self.num_mem_kv > 0: mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v)) k = torch.cat((mem_k, k), dim=-2) v = torch.cat((mem_v, v), dim=-2) if exists(input_mask): input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True) dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale mask_value = max_neg_value(dots) if exists(prev_attn): dots = dots + prev_attn pre_softmax_attn = dots if talking_heads: dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous() if exists(rel_pos): dots = rel_pos(dots) if exists(input_mask): dots.masked_fill_(~input_mask, mask_value) del input_mask if self.causal: i, j = dots.shape[-2:] r = torch.arange(i, device=device) mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j') mask = F.pad(mask, (j - i, 0), value=False) dots.masked_fill_(mask, mask_value) del mask if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: top, _ = dots.topk(self.sparse_topk, dim=-1) vk = top[..., -1].unsqueeze(-1).expand_as(dots) mask = dots < vk dots.masked_fill_(mask, mask_value) del mask attn = self.attn_fn(dots, dim=-1) post_softmax_attn = attn attn = self.dropout(attn) if talking_heads: attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous() out = einsum('b h i j, b h j d -> b h i d', attn, v) out = rearrange(out, 'b h n d -> b n (h d)') intermediates = Intermediates( pre_softmax_attn=pre_softmax_attn, post_softmax_attn=post_softmax_attn ) return self.to_out(out), intermediates class AttentionLayers(nn.Module): def __init__( self, dim, depth, heads=8, causal=False, cross_attend=False, only_cross=False, use_scalenorm=False, use_rmsnorm=False, use_rezero=False, rel_pos_num_buckets=32, rel_pos_max_distance=128, position_infused_attn=False, custom_layers=None, sandwich_coef=None, par_ratio=None, residual_attn=False, cross_residual_attn=False, macaron=False, pre_norm=True, gate_residual=False, **kwargs ): super().__init__() ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) self.dim = dim self.depth = depth self.layers = nn.ModuleList([]) self.has_pos_emb = position_infused_attn self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None self.rotary_pos_emb = always(None) assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance' self.rel_pos = None self.pre_norm = pre_norm self.residual_attn = residual_attn self.cross_residual_attn = cross_residual_attn norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm norm_class = RMSNorm if use_rmsnorm else norm_class norm_fn = partial(norm_class, dim) norm_fn = nn.Identity if use_rezero else norm_fn branch_fn = Rezero if use_rezero else None if cross_attend and not only_cross: default_block = ('a', 'c', 'f') elif cross_attend and only_cross: default_block = ('c', 'f') else: default_block = ('a', 'f') if macaron: default_block = ('f',) + default_block if exists(custom_layers): layer_types = custom_layers elif exists(par_ratio): par_depth = depth * len(default_block) assert 1 < par_ratio <= par_depth, 'par ratio out of range' default_block = tuple(filter(not_equals('f'), default_block)) par_attn = par_depth // par_ratio depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper par_width = (depth_cut + depth_cut // par_attn) // par_attn assert len(default_block) <= par_width, 'default block is too large for par_ratio' par_block = default_block + ('f',) * (par_width - len(default_block)) par_head = par_block * par_attn layer_types = par_head + ('f',) * (par_depth - len(par_head)) elif exists(sandwich_coef): assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef else: layer_types = default_block * depth self.layer_types = layer_types self.num_attn_layers = len(list(filter(equals('a'), layer_types))) for layer_type in self.layer_types: if layer_type == 'a': layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs) elif layer_type == 'c': layer = Attention(dim, heads=heads, **attn_kwargs) elif layer_type == 'f': layer = FeedForward(dim, **ff_kwargs) layer = layer if not macaron else Scale(0.5, layer) else: raise Exception(f'invalid layer type {layer_type}') if isinstance(layer, Attention) and exists(branch_fn): layer = branch_fn(layer) if gate_residual: residual_fn = GRUGating(dim) else: residual_fn = Residual() self.layers.append(nn.ModuleList([ norm_fn(), layer, residual_fn ])) def forward( self, x, context=None, mask=None, context_mask=None, mems=None, return_hiddens=False, **kwargs ): hiddens = [] intermediates = [] prev_attn = None prev_cross_attn = None mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)): is_last = ind == (len(self.layers) - 1) if layer_type == 'a': hiddens.append(x) layer_mem = mems.pop(0) residual = x if self.pre_norm: x = norm(x) if layer_type == 'a': out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos, prev_attn=prev_attn, mem=layer_mem) elif layer_type == 'c': out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn) elif layer_type == 'f': out = block(x) x = residual_fn(out, residual) if layer_type in ('a', 'c'): intermediates.append(inter) if layer_type == 'a' and self.residual_attn: prev_attn = inter.pre_softmax_attn elif layer_type == 'c' and self.cross_residual_attn: prev_cross_attn = inter.pre_softmax_attn if not self.pre_norm and not is_last: x = norm(x) if return_hiddens: intermediates = LayerIntermediates( hiddens=hiddens, attn_intermediates=intermediates ) return x, intermediates return x class Encoder(AttentionLayers): def __init__(self, **kwargs): assert 'causal' not in kwargs, 'cannot set causality on encoder' super().__init__(causal=False, **kwargs) class TransformerWrapper(nn.Module): def __init__( self, *, num_tokens, max_seq_len, attn_layers, emb_dim=None, max_mem_len=0., emb_dropout=0., num_memory_tokens=None, tie_embedding=False, use_pos_emb=True ): super().__init__() assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder' dim = attn_layers.dim emb_dim = default(emb_dim, dim) self.max_seq_len = max_seq_len self.max_mem_len = max_mem_len self.num_tokens = num_tokens self.token_emb = nn.Embedding(num_tokens, emb_dim) self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if ( use_pos_emb and not attn_layers.has_pos_emb) else always(0) self.emb_dropout = nn.Dropout(emb_dropout) self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity() self.attn_layers = attn_layers self.norm = nn.LayerNorm(dim) self.init_() self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() # memory tokens (like [cls]) from Memory Transformers paper num_memory_tokens = default(num_memory_tokens, 0) self.num_memory_tokens = num_memory_tokens if num_memory_tokens > 0: self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim)) # let funnel encoder know number of memory tokens, if specified if hasattr(attn_layers, 'num_memory_tokens'): attn_layers.num_memory_tokens = num_memory_tokens def init_(self): nn.init.normal_(self.token_emb.weight, std=0.02) def forward( self, x, return_embeddings=False, mask=None, return_mems=False, return_attn=False, mems=None, embedding_manager=None, **kwargs ): b, n, device, num_mem = *x.shape, x.device, self.num_memory_tokens embedded_x = self.token_emb(x) if embedding_manager: x = embedding_manager(x, embedded_x) else: x = embedded_x x = x + self.pos_emb(x) x = self.emb_dropout(x) x = self.project_emb(x) if num_mem > 0: mem = repeat(self.memory_tokens, 'n d -> b n d', b=b) x = torch.cat((mem, x), dim=1) # auto-handle masking after appending memory tokens if exists(mask): mask = F.pad(mask, (num_mem, 0), value=True) x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs) x = self.norm(x) mem, x = x[:, :num_mem], x[:, num_mem:] out = self.to_logits(x) if not return_embeddings else x if return_mems: hiddens = intermediates.hiddens new_mems = list(map(lambda pair: torch.cat(pair, dim=-2), zip(mems, hiddens))) if exists(mems) else hiddens new_mems = list(map(lambda t: t[..., -self.max_mem_len:, :].detach(), new_mems)) return out, new_mems if return_attn: attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates)) return out, attn_maps return out ================================================ FILE: ldm/util.py ================================================ import importlib import torch import numpy as np from collections import abc from einops import rearrange from functools import partial import multiprocessing as mp from threading import Thread from queue import Queue from inspect import isfunction from PIL import Image, ImageDraw, ImageFont def log_txt_as_img(wh, xc, size=10): # wh a tuple of (width, height) # xc a list of captions to plot b = len(xc) txts = list() for bi in range(b): txt = Image.new("RGB", wh, color="white") draw = ImageDraw.Draw(txt) font = ImageFont.truetype('data/DejaVuSans.ttf', size=size) nc = int(40 * (wh[0] / 256)) lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc)) try: draw.text((0, 0), lines, fill="black", font=font) except UnicodeEncodeError: print("Cant encode string for logging. Skipping.") txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 txts.append(txt) txts = np.stack(txts) txts = torch.tensor(txts) return txts def ismap(x): if not isinstance(x, torch.Tensor): return False return (len(x.shape) == 4) and (x.shape[1] > 3) def isimage(x): if not isinstance(x, torch.Tensor): return False return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) def exists(x): return x is not None def default(val, d): if exists(val): return val return d() if isfunction(d) else d def mean_flat(tensor): """ https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 Take the mean over all non-batch dimensions. """ return tensor.mean(dim=list(range(1, len(tensor.shape)))) def count_params(model, verbose=False): total_params = sum(p.numel() for p in model.parameters()) if verbose: print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.") return total_params def instantiate_from_config(config, **kwargs): if not "target" in config: if config == '__is_first_stage__': return None elif config == "__is_unconditional__": return None raise KeyError("Expected key `target` to instantiate.") return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs) def get_obj_from_str(string, reload=False): module, cls = string.rsplit(".", 1) if reload: module_imp = importlib.import_module(module) importlib.reload(module_imp) return getattr(importlib.import_module(module, package=None), cls) def _do_parallel_data_prefetch(func, Q, data, idx, idx_to_fn=False): # create dummy dataset instance # run prefetching if idx_to_fn: res = func(data, worker_id=idx) else: res = func(data) Q.put([idx, res]) Q.put("Done") def parallel_data_prefetch( func: callable, data, n_proc, target_data_type="ndarray", cpu_intensive=True, use_worker_id=False ): # if target_data_type not in ["ndarray", "list"]: # raise ValueError( # "Data, which is passed to parallel_data_prefetch has to be either of type list or ndarray." # ) if isinstance(data, np.ndarray) and target_data_type == "list": raise ValueError("list expected but function got ndarray.") elif isinstance(data, abc.Iterable): if isinstance(data, dict): print( f'WARNING:"data" argument passed to parallel_data_prefetch is a dict: Using only its values and disregarding keys.' ) data = list(data.values()) if target_data_type == "ndarray": data = np.asarray(data) else: data = list(data) else: raise TypeError( f"The data, that shall be processed parallel has to be either an np.ndarray or an Iterable, but is actually {type(data)}." ) if cpu_intensive: Q = mp.Queue(1000) proc = mp.Process else: Q = Queue(1000) proc = Thread # spawn processes if target_data_type == "ndarray": arguments = [ [func, Q, part, i, use_worker_id] for i, part in enumerate(np.array_split(data, n_proc)) ] else: step = ( int(len(data) / n_proc + 1) if len(data) % n_proc != 0 else int(len(data) / n_proc) ) arguments = [ [func, Q, part, i, use_worker_id] for i, part in enumerate( [data[i: i + step] for i in range(0, len(data), step)] ) ] processes = [] for i in range(n_proc): p = proc(target=_do_parallel_data_prefetch, args=arguments[i]) processes += [p] # start processes print(f"Start prefetching...") import time start = time.time() gather_res = [[] for _ in range(n_proc)] try: for p in processes: p.start() k = 0 while k < n_proc: # get result res = Q.get() if res == "Done": k += 1 else: gather_res[res[0]] = res[1] except Exception as e: print("Exception: ", e) for p in processes: p.terminate() raise e finally: for p in processes: p.join() print(f"Prefetching complete. [{time.time() - start} sec.]") if target_data_type == 'ndarray': if not isinstance(gather_res[0], np.ndarray): return np.concatenate([np.asarray(r) for r in gather_res], axis=0) # order outputs return np.concatenate(gather_res, axis=0) elif target_data_type == 'list': out = [] for r in gather_res: out.extend(r) return out else: return gather_res ================================================ FILE: main.py ================================================ import argparse, os, sys, datetime, glob, importlib, csv import numpy as np import time import torch import torchvision import pytorch_lightning as pl from packaging import version from omegaconf import OmegaConf from torch.utils.data import random_split, DataLoader, Dataset, Subset from functools import partial from PIL import Image from pytorch_lightning import seed_everything from pytorch_lightning.trainer import Trainer from pytorch_lightning.callbacks import ModelCheckpoint, Callback, LearningRateMonitor from pytorch_lightning.utilities.distributed import rank_zero_only from pytorch_lightning.utilities import rank_zero_info from ldm.data.base import Txt2ImgIterableBaseDataset from ldm.util import instantiate_from_config def load_model_from_config(config, ckpt, verbose=False): print(f"Loading model from {ckpt}") pl_sd = torch.load(ckpt, map_location="cpu") sd = pl_sd["state_dict"] config.model.params.ckpt_path = ckpt model = instantiate_from_config(config.model) m, u = model.load_state_dict(sd, strict=False) if len(m) > 0 and verbose: print("missing keys:") print(m) if len(u) > 0 and verbose: print("unexpected keys:") print(u) model.cuda() return model def get_parser(**parser_kwargs): def str2bool(v): if isinstance(v, bool): return v if v.lower() in ("yes", "true", "t", "y", "1"): return True elif v.lower() in ("no", "false", "f", "n", "0"): return False else: raise argparse.ArgumentTypeError("Boolean value expected.") parser = argparse.ArgumentParser(**parser_kwargs) parser.add_argument( "-n", "--name", type=str, const=True, default="", nargs="?", help="postfix for logdir", ) parser.add_argument( "-r", "--resume", type=str, const=True, default="", nargs="?", help="resume from logdir or checkpoint in logdir", ) parser.add_argument( "-b", "--base", nargs="*", metavar="base_config.yaml", help="paths to base configs. Loaded from left-to-right. " "Parameters can be overwritten or added with command-line options of the form `--key value`.", default=list(), ) parser.add_argument( "-t", "--train", type=str2bool, const=True, default=False, nargs="?", help="train", ) parser.add_argument( "--no-test", type=str2bool, const=True, default=False, nargs="?", help="disable test", ) parser.add_argument( "-p", "--project", help="name of new or path to existing project" ) parser.add_argument( "-d", "--debug", type=str2bool, nargs="?", const=True, default=False, help="enable post-mortem debugging", ) parser.add_argument( "-s", "--seed", type=int, default=23, help="seed for seed_everything", ) parser.add_argument( "-f", "--postfix", type=str, default="", help="post-postfix for default name", ) parser.add_argument( "-l", "--logdir", type=str, default="logs", help="directory for logging dat shit", ) parser.add_argument( "--scale_lr", type=str2bool, nargs="?", const=False, default=False, help="scale base-lr by ngpu * batch_size * n_accumulate", ) parser.add_argument( "--datadir_in_name", type=str2bool, nargs="?", const=True, default=True, help="Prepend the final directory in the data_root to the output directory name") parser.add_argument("--actual_resume", type=str, required=True, help="Path to model to actually resume from") parser.add_argument("--data_root", type=str, required=True, help="Path to directory with training images") parser.add_argument("--reg_data_root", type=str, required=True, help="Path to directory with regularization images") parser.add_argument("--embedding_manager_ckpt", type=str, default="", help="Initialize embedding manager from a checkpoint") parser.add_argument("--class_word", type=str, default="dog", help="Placeholder token which will be used to denote the concept in future prompts") parser.add_argument("--init_word", type=str, help="Word to use as source for initial token embedding") return parser def nondefault_trainer_args(opt): parser = argparse.ArgumentParser() parser = Trainer.add_argparse_args(parser) args = parser.parse_args([]) return sorted(k for k in vars(args) if getattr(opt, k) != getattr(args, k)) class WrappedDataset(Dataset): """Wraps an arbitrary object with __len__ and __getitem__ into a pytorch dataset""" def __init__(self, dataset): self.data = dataset def __len__(self): return len(self.data) def __getitem__(self, idx): return self.data[idx] def worker_init_fn(_): worker_info = torch.utils.data.get_worker_info() dataset = worker_info.dataset worker_id = worker_info.id if isinstance(dataset, Txt2ImgIterableBaseDataset): split_size = dataset.num_records // worker_info.num_workers # reset num_records to the true number to retain reliable length information dataset.sample_ids = dataset.valid_ids[worker_id * split_size:(worker_id + 1) * split_size] current_id = np.random.choice(len(np.random.get_state()[1]), 1) return np.random.seed(np.random.get_state()[1][current_id] + worker_id) else: return np.random.seed(np.random.get_state()[1][0] + worker_id) class ConcatDataset(Dataset): def __init__(self, *datasets): self.datasets = datasets def __getitem__(self, idx): return tuple(d[idx] for d in self.datasets) def __len__(self): return min(len(d) for d in self.datasets) class DataModuleFromConfig(pl.LightningDataModule): def __init__(self, batch_size, train=None, reg = None, validation=None, test=None, predict=None, wrap=False, num_workers=None, shuffle_test_loader=False, use_worker_init_fn=False, shuffle_val_dataloader=False): super().__init__() self.batch_size = batch_size self.dataset_configs = dict() self.num_workers = num_workers if num_workers is not None else batch_size * 2 self.use_worker_init_fn = use_worker_init_fn if train is not None: self.dataset_configs["train"] = train if reg is not None: self.dataset_configs["reg"] = reg self.train_dataloader = self._train_dataloader if validation is not None: self.dataset_configs["validation"] = validation self.val_dataloader = partial(self._val_dataloader, shuffle=shuffle_val_dataloader) if test is not None: self.dataset_configs["test"] = test self.test_dataloader = partial(self._test_dataloader, shuffle=shuffle_test_loader) if predict is not None: self.dataset_configs["predict"] = predict self.predict_dataloader = self._predict_dataloader self.wrap = wrap def prepare_data(self): for data_cfg in self.dataset_configs.values(): instantiate_from_config(data_cfg) def setup(self, stage=None): self.datasets = dict( (k, instantiate_from_config(self.dataset_configs[k])) for k in self.dataset_configs) if self.wrap: for k in self.datasets: self.datasets[k] = WrappedDataset(self.datasets[k]) def _train_dataloader(self): is_iterable_dataset = isinstance(self.datasets['train'], Txt2ImgIterableBaseDataset) if is_iterable_dataset or self.use_worker_init_fn: init_fn = worker_init_fn else: init_fn = None train_set = self.datasets["train"] reg_set = self.datasets["reg"] concat_dataset = ConcatDataset(train_set, reg_set) return DataLoader(concat_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False if is_iterable_dataset else True, worker_init_fn=init_fn) def _val_dataloader(self, shuffle=False): if isinstance(self.datasets['validation'], Txt2ImgIterableBaseDataset) or self.use_worker_init_fn: init_fn = worker_init_fn else: init_fn = None return DataLoader(self.datasets["validation"], batch_size=self.batch_size, num_workers=self.num_workers, worker_init_fn=init_fn, shuffle=shuffle) def _test_dataloader(self, shuffle=False): is_iterable_dataset = isinstance(self.datasets['train'], Txt2ImgIterableBaseDataset) if is_iterable_dataset or self.use_worker_init_fn: init_fn = worker_init_fn else: init_fn = None # do not shuffle dataloader for iterable dataset shuffle = shuffle and (not is_iterable_dataset) return DataLoader(self.datasets["test"], batch_size=self.batch_size, num_workers=self.num_workers, worker_init_fn=init_fn, shuffle=shuffle) def _predict_dataloader(self, shuffle=False): if isinstance(self.datasets['predict'], Txt2ImgIterableBaseDataset) or self.use_worker_init_fn: init_fn = worker_init_fn else: init_fn = None return DataLoader(self.datasets["predict"], batch_size=self.batch_size, num_workers=self.num_workers, worker_init_fn=init_fn) class SetupCallback(Callback): def __init__(self, resume, now, logdir, ckptdir, cfgdir, config, lightning_config): super().__init__() self.resume = resume self.now = now self.logdir = logdir self.ckptdir = ckptdir self.cfgdir = cfgdir self.config = config self.lightning_config = lightning_config def on_keyboard_interrupt(self, trainer, pl_module): if trainer.global_rank == 0: print("Summoning checkpoint.") ckpt_path = os.path.join(self.ckptdir, "last.ckpt") trainer.save_checkpoint(ckpt_path) def on_pretrain_routine_start(self, trainer, pl_module): if trainer.global_rank == 0: # Create logdirs and save configs os.makedirs(self.logdir, exist_ok=True) os.makedirs(self.ckptdir, exist_ok=True) os.makedirs(self.cfgdir, exist_ok=True) if "callbacks" in self.lightning_config: if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']: os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True) print("Project config") print(OmegaConf.to_yaml(self.config)) OmegaConf.save(self.config, os.path.join(self.cfgdir, "{}-project.yaml".format(self.now))) print("Lightning config") print(OmegaConf.to_yaml(self.lightning_config)) OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}), os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now))) else: # ModelCheckpoint callback created log directory --- remove it if not self.resume and os.path.exists(self.logdir): dst, name = os.path.split(self.logdir) dst = os.path.join(dst, "child_runs", name) os.makedirs(os.path.split(dst)[0], exist_ok=True) try: os.rename(self.logdir, dst) except FileNotFoundError: pass class ImageLogger(Callback): def __init__(self, batch_frequency, max_images, clamp=True, increase_log_steps=True, rescale=True, disabled=False, log_on_batch_idx=False, log_first_step=False, log_images_kwargs=None): super().__init__() self.rescale = rescale self.batch_freq = batch_frequency self.max_images = max_images self.logger_log_images = { pl.loggers.TestTubeLogger: self._testtube, } self.log_steps = [2 ** n for n in range(int(np.log2(self.batch_freq)) + 1)] if not increase_log_steps: self.log_steps = [self.batch_freq] self.clamp = clamp self.disabled = disabled self.log_on_batch_idx = log_on_batch_idx self.log_images_kwargs = log_images_kwargs if log_images_kwargs else {} self.log_first_step = log_first_step @rank_zero_only def _testtube(self, pl_module, images, batch_idx, split): for k in images: grid = torchvision.utils.make_grid(images[k]) grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w tag = f"{split}/{k}" pl_module.logger.experiment.add_image( tag, grid, global_step=pl_module.global_step) @rank_zero_only def log_local(self, save_dir, split, images, global_step, current_epoch, batch_idx): root = os.path.join(save_dir, "images", split) for k in images: grid = torchvision.utils.make_grid(images[k], nrow=4) if self.rescale: grid = (grid + 1.0) / 2.0 # -1,1 -> 0,1; c,h,w grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1) grid = grid.numpy() grid = (grid * 255).astype(np.uint8) filename = "{}_gs-{:06}_e-{:06}_b-{:06}.jpg".format( k, global_step, current_epoch, batch_idx) path = os.path.join(root, filename) os.makedirs(os.path.split(path)[0], exist_ok=True) Image.fromarray(grid).save(path) def log_img(self, pl_module, batch, batch_idx, split="train"): check_idx = batch_idx if self.log_on_batch_idx else pl_module.global_step if (self.check_frequency(check_idx) and # batch_idx % self.batch_freq == 0 hasattr(pl_module, "log_images") and callable(pl_module.log_images) and self.max_images > 0): logger = type(pl_module.logger) is_train = pl_module.training if is_train: pl_module.eval() with torch.no_grad(): images = pl_module.log_images(batch, split=split, **self.log_images_kwargs) for k in images: N = min(images[k].shape[0], self.max_images) images[k] = images[k][:N] if isinstance(images[k], torch.Tensor): images[k] = images[k].detach().cpu() if self.clamp: images[k] = torch.clamp(images[k], -1., 1.) self.log_local(pl_module.logger.save_dir, split, images, pl_module.global_step, pl_module.current_epoch, batch_idx) logger_log_images = self.logger_log_images.get(logger, lambda *args, **kwargs: None) logger_log_images(pl_module, images, pl_module.global_step, split) if is_train: pl_module.train() def check_frequency(self, check_idx): if ((check_idx % self.batch_freq) == 0 or (check_idx in self.log_steps)) and ( check_idx > 0 or self.log_first_step): try: self.log_steps.pop(0) except IndexError as e: print(e) pass return True return False def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): if not self.disabled and (pl_module.global_step > 0 or self.log_first_step): self.log_img(pl_module, batch, batch_idx, split="train") def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx): if not self.disabled and pl_module.global_step > 0: self.log_img(pl_module, batch, batch_idx, split="val") if hasattr(pl_module, 'calibrate_grad_norm'): if (pl_module.calibrate_grad_norm and batch_idx % 25 == 0) and batch_idx > 0: self.log_gradients(trainer, pl_module, batch_idx=batch_idx) class CUDACallback(Callback): # see https://github.com/SeanNaren/minGPT/blob/master/mingpt/callback.py def on_train_epoch_start(self, trainer, pl_module): # Reset the memory use counter torch.cuda.reset_peak_memory_stats(trainer.root_gpu) torch.cuda.synchronize(trainer.root_gpu) self.start_time = time.time() def on_train_epoch_end(self, trainer, pl_module): torch.cuda.synchronize(trainer.root_gpu) max_memory = torch.cuda.max_memory_allocated(trainer.root_gpu) / 2 ** 20 epoch_time = time.time() - self.start_time try: max_memory = trainer.training_type_plugin.reduce(max_memory) epoch_time = trainer.training_type_plugin.reduce(epoch_time) rank_zero_info(f"Average Epoch time: {epoch_time:.2f} seconds") rank_zero_info(f"Average Peak memory {max_memory:.2f}MiB") except AttributeError: pass class ModeSwapCallback(Callback): def __init__(self, swap_step=2000): super().__init__() self.is_frozen = False self.swap_step = swap_step def on_train_epoch_start(self, trainer, pl_module): if trainer.global_step < self.swap_step and not self.is_frozen: self.is_frozen = True trainer.optimizers = [pl_module.configure_opt_embedding()] if trainer.global_step > self.swap_step and self.is_frozen: self.is_frozen = False trainer.optimizers = [pl_module.configure_opt_model()] if __name__ == "__main__": now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") # add cwd for convenience and to make classes in this file available when # running as `python main.py` # (in particular `main.DataModuleFromConfig`) sys.path.append(os.getcwd()) parser = get_parser() parser = Trainer.add_argparse_args(parser) opt, unknown = parser.parse_known_args() if opt.name and opt.resume: raise ValueError( "-n/--name and -r/--resume cannot be specified both." "If you want to resume training in a new log folder, " "use -n/--name in combination with --resume_from_checkpoint" ) if opt.resume: if not os.path.exists(opt.resume): raise ValueError("Cannot find {}".format(opt.resume)) if os.path.isfile(opt.resume): paths = opt.resume.split("/") # idx = len(paths)-paths[::-1].index("logs")+1 # logdir = "/".join(paths[:idx]) logdir = "/".join(paths[:-2]) ckpt = opt.resume else: assert os.path.isdir(opt.resume), opt.resume logdir = opt.resume.rstrip("/") ckpt = os.path.join(logdir, "checkpoints", "last.ckpt") opt.resume_from_checkpoint = ckpt base_configs = sorted(glob.glob(os.path.join(logdir, "configs/*.yaml"))) opt.base = base_configs + opt.base _tmp = logdir.split("/") nowname = _tmp[-1] else: if opt.name: name = "_" + opt.name elif opt.base: cfg_fname = os.path.split(opt.base[0])[-1] cfg_name = os.path.splitext(cfg_fname)[0] name = "_" + cfg_name else: name = "" if opt.datadir_in_name: now = os.path.basename(os.path.normpath(opt.data_root)) + now nowname = now + name + opt.postfix logdir = os.path.join(opt.logdir, nowname) ckptdir = os.path.join(logdir, "checkpoints") cfgdir = os.path.join(logdir, "configs") seed_everything(opt.seed) try: # init and save configs configs = [OmegaConf.load(cfg) for cfg in opt.base] cli = OmegaConf.from_dotlist(unknown) config = OmegaConf.merge(*configs, cli) lightning_config = config.pop("lightning", OmegaConf.create()) # merge trainer cli with config trainer_config = lightning_config.get("trainer", OmegaConf.create()) # default to ddp trainer_config["accelerator"] = "ddp" for k in nondefault_trainer_args(opt): trainer_config[k] = getattr(opt, k) if not "gpus" in trainer_config: del trainer_config["accelerator"] cpu = True else: gpuinfo = trainer_config["gpus"] print(f"Running on GPUs {gpuinfo}") cpu = False trainer_opt = argparse.Namespace(**trainer_config) lightning_config.trainer = trainer_config # model # config.model.params.personalization_config.params.init_word = opt.init_word # config.model.params.personalization_config.params.embedding_manager_ckpt = opt.embedding_manager_ckpt # config.model.params.personalization_config.params.placeholder_tokens = opt.placeholder_tokens # if opt.init_word: # config.model.params.personalization_config.params.initializer_words[0] = opt.init_word config.data.params.train.params.placeholder_token = opt.class_word config.data.params.reg.params.placeholder_token = opt.class_word config.data.params.validation.params.placeholder_token = opt.class_word if opt.actual_resume: model = load_model_from_config(config, opt.actual_resume) else: model = instantiate_from_config(config.model) # trainer and callbacks trainer_kwargs = dict() # default logger configs default_logger_cfgs = { "wandb": { "target": "pytorch_lightning.loggers.WandbLogger", "params": { "name": nowname, "save_dir": logdir, "offline": opt.debug, "id": nowname, } }, "testtube": { "target": "pytorch_lightning.loggers.TestTubeLogger", "params": { "name": "testtube", "save_dir": logdir, } }, } default_logger_cfg = default_logger_cfgs["testtube"] if "logger" in lightning_config: logger_cfg = lightning_config.logger else: logger_cfg = OmegaConf.create() logger_cfg = OmegaConf.merge(default_logger_cfg, logger_cfg) trainer_kwargs["logger"] = instantiate_from_config(logger_cfg) # modelcheckpoint - use TrainResult/EvalResult(checkpoint_on=metric) to # specify which metric is used to determine best models default_modelckpt_cfg = { "target": "pytorch_lightning.callbacks.ModelCheckpoint", "params": { "dirpath": ckptdir, "filename": "{epoch:06}", "verbose": True, "save_last": True, } } if hasattr(model, "monitor"): print(f"Monitoring {model.monitor} as checkpoint metric.") default_modelckpt_cfg["params"]["monitor"] = model.monitor default_modelckpt_cfg["params"]["save_top_k"] = 1 if "modelcheckpoint" in lightning_config: modelckpt_cfg = lightning_config.modelcheckpoint else: modelckpt_cfg = OmegaConf.create() modelckpt_cfg = OmegaConf.merge(default_modelckpt_cfg, modelckpt_cfg) print(f"Merged modelckpt-cfg: \n{modelckpt_cfg}") if version.parse(pl.__version__) < version.parse('1.4.0'): trainer_kwargs["checkpoint_callback"] = instantiate_from_config(modelckpt_cfg) # add callback which sets up log directory default_callbacks_cfg = { "setup_callback": { "target": "main.SetupCallback", "params": { "resume": opt.resume, "now": now, "logdir": logdir, "ckptdir": ckptdir, "cfgdir": cfgdir, "config": config, "lightning_config": lightning_config, } }, "image_logger": { "target": "main.ImageLogger", "params": { "batch_frequency": 750, "max_images": 4, "clamp": True } }, "learning_rate_logger": { "target": "main.LearningRateMonitor", "params": { "logging_interval": "step", # "log_momentum": True } }, "cuda_callback": { "target": "main.CUDACallback" }, } if version.parse(pl.__version__) >= version.parse('1.4.0'): default_callbacks_cfg.update({'checkpoint_callback': modelckpt_cfg}) if "callbacks" in lightning_config: callbacks_cfg = lightning_config.callbacks else: callbacks_cfg = OmegaConf.create() if 'metrics_over_trainsteps_checkpoint' in callbacks_cfg: print( 'Caution: Saving checkpoints every n train steps without deleting. This might require some free space.') default_metrics_over_trainsteps_ckpt_dict = { 'metrics_over_trainsteps_checkpoint': {"target": 'pytorch_lightning.callbacks.ModelCheckpoint', 'params': { "dirpath": os.path.join(ckptdir, 'trainstep_checkpoints'), "filename": "{epoch:06}-{step:09}", "verbose": True, 'save_top_k': -1, 'every_n_train_steps': 10000, 'save_weights_only': True } } } default_callbacks_cfg.update(default_metrics_over_trainsteps_ckpt_dict) callbacks_cfg = OmegaConf.merge(default_callbacks_cfg, callbacks_cfg) if 'ignore_keys_callback' in callbacks_cfg and hasattr(trainer_opt, 'resume_from_checkpoint'): callbacks_cfg.ignore_keys_callback.params['ckpt_path'] = trainer_opt.resume_from_checkpoint elif 'ignore_keys_callback' in callbacks_cfg: del callbacks_cfg['ignore_keys_callback'] trainer_kwargs["callbacks"] = [instantiate_from_config(callbacks_cfg[k]) for k in callbacks_cfg] trainer_kwargs["max_steps"] = trainer_opt.max_steps trainer = Trainer.from_argparse_args(trainer_opt, **trainer_kwargs) trainer.logdir = logdir ### # data config.data.params.train.params.data_root = opt.data_root config.data.params.reg.params.data_root = opt.reg_data_root config.data.params.validation.params.data_root = opt.data_root data = instantiate_from_config(config.data) data = instantiate_from_config(config.data) # NOTE according to https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html # calling these ourselves should not be necessary but it is. # lightning still takes care of proper multiprocessing though data.prepare_data() data.setup() print("#### Data #####") for k in data.datasets: print(f"{k}, {data.datasets[k].__class__.__name__}, {len(data.datasets[k])}") # configure learning rate bs, base_lr = config.data.params.batch_size, config.model.base_learning_rate if not cpu: ngpu = len(lightning_config.trainer.gpus.strip(",").split(',')) else: ngpu = 1 if 'accumulate_grad_batches' in lightning_config.trainer: accumulate_grad_batches = lightning_config.trainer.accumulate_grad_batches else: accumulate_grad_batches = 1 print(f"accumulate_grad_batches = {accumulate_grad_batches}") lightning_config.trainer.accumulate_grad_batches = accumulate_grad_batches if opt.scale_lr: model.learning_rate = accumulate_grad_batches * ngpu * bs * base_lr print( "Setting learning rate to {:.2e} = {} (accumulate_grad_batches) * {} (num_gpus) * {} (batchsize) * {:.2e} (base_lr)".format( model.learning_rate, accumulate_grad_batches, ngpu, bs, base_lr)) else: model.learning_rate = base_lr print("++++ NOT USING LR SCALING ++++") print(f"Setting learning rate to {model.learning_rate:.2e}") # allow checkpointing via USR1 def melk(*args, **kwargs): # run all checkpoint hooks if trainer.global_rank == 0: print("Summoning checkpoint.") ckpt_path = os.path.join(ckptdir, "last.ckpt") trainer.save_checkpoint(ckpt_path) def divein(*args, **kwargs): if trainer.global_rank == 0: import pudb; pudb.set_trace() import signal signal.signal(signal.SIGUSR1, melk) signal.signal(signal.SIGUSR2, divein) # run if opt.train: try: trainer.fit(model, data) except Exception: melk() raise if not opt.no_test and not trainer.interrupted: trainer.test(model, data) except Exception: if opt.debug and trainer.global_rank == 0: try: import ipdb as debugger except ImportError: import pdb as debugger debugger.post_mortem() raise finally: # move newly created debug project to debug_runs if opt.debug and not opt.resume and trainer.global_rank == 0: dst, name = os.path.split(logdir) dst = os.path.join(dst, "debug_runs", name) os.makedirs(os.path.split(dst)[0], exist_ok=True) os.rename(logdir, dst) if trainer.global_rank == 0: print(trainer.profiler.summary()) ================================================ FILE: scripts/download_first_stages.sh ================================================ #!/bin/bash wget -O models/first_stage_models/kl-f4/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f4.zip wget -O models/first_stage_models/kl-f8/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f8.zip wget -O models/first_stage_models/kl-f16/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f16.zip wget -O models/first_stage_models/kl-f32/model.zip https://ommer-lab.com/files/latent-diffusion/kl-f32.zip wget -O models/first_stage_models/vq-f4/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4.zip wget -O models/first_stage_models/vq-f4-noattn/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f4-noattn.zip wget -O models/first_stage_models/vq-f8/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8.zip wget -O models/first_stage_models/vq-f8-n256/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f8-n256.zip wget -O models/first_stage_models/vq-f16/model.zip https://ommer-lab.com/files/latent-diffusion/vq-f16.zip cd models/first_stage_models/kl-f4 unzip -o model.zip cd ../kl-f8 unzip -o model.zip cd ../kl-f16 unzip -o model.zip cd ../kl-f32 unzip -o model.zip cd ../vq-f4 unzip -o model.zip cd ../vq-f4-noattn unzip -o model.zip cd ../vq-f8 unzip -o model.zip cd ../vq-f8-n256 unzip -o model.zip cd ../vq-f16 unzip -o model.zip cd ../.. ================================================ FILE: scripts/download_models.sh ================================================ #!/bin/bash wget -O models/ldm/celeba256/celeba-256.zip https://ommer-lab.com/files/latent-diffusion/celeba.zip wget -O models/ldm/ffhq256/ffhq-256.zip https://ommer-lab.com/files/latent-diffusion/ffhq.zip wget -O models/ldm/lsun_churches256/lsun_churches-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_churches.zip wget -O models/ldm/lsun_beds256/lsun_beds-256.zip https://ommer-lab.com/files/latent-diffusion/lsun_bedrooms.zip wget -O models/ldm/text2img256/model.zip https://ommer-lab.com/files/latent-diffusion/text2img.zip wget -O models/ldm/cin256/model.zip https://ommer-lab.com/files/latent-diffusion/cin.zip wget -O models/ldm/semantic_synthesis512/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis.zip wget -O models/ldm/semantic_synthesis256/model.zip https://ommer-lab.com/files/latent-diffusion/semantic_synthesis256.zip wget -O models/ldm/bsr_sr/model.zip https://ommer-lab.com/files/latent-diffusion/sr_bsr.zip wget -O models/ldm/layout2img-openimages256/model.zip https://ommer-lab.com/files/latent-diffusion/layout2img_model.zip wget -O models/ldm/inpainting_big/model.zip https://ommer-lab.com/files/latent-diffusion/inpainting_big.zip cd models/ldm/celeba256 unzip -o celeba-256.zip cd ../ffhq256 unzip -o ffhq-256.zip cd ../lsun_churches256 unzip -o lsun_churches-256.zip cd ../lsun_beds256 unzip -o lsun_beds-256.zip cd ../text2img256 unzip -o model.zip cd ../cin256 unzip -o model.zip cd ../semantic_synthesis512 unzip -o model.zip cd ../semantic_synthesis256 unzip -o model.zip cd ../bsr_sr unzip -o model.zip cd ../layout2img-openimages256 unzip -o model.zip cd ../inpainting_big unzip -o model.zip cd ../.. ================================================ FILE: scripts/sample_diffusion.py ================================================ import argparse, os, sys, glob, datetime, yaml import torch import time import numpy as np from tqdm import trange from omegaconf import OmegaConf from PIL import Image from ldm.models.diffusion.ddim import DDIMSampler from ldm.util import instantiate_from_config rescale = lambda x: (x + 1.) / 2. def custom_to_pil(x): x = x.detach().cpu() x = torch.clamp(x, -1., 1.) x = (x + 1.) / 2. x = x.permute(1, 2, 0).numpy() x = (255 * x).astype(np.uint8) x = Image.fromarray(x) if not x.mode == "RGB": x = x.convert("RGB") return x def custom_to_np(x): # saves the batch in adm style as in https://github.com/openai/guided-diffusion/blob/main/scripts/image_sample.py sample = x.detach().cpu() sample = ((sample + 1) * 127.5).clamp(0, 255).to(torch.uint8) sample = sample.permute(0, 2, 3, 1) sample = sample.contiguous() return sample def logs2pil(logs, keys=["sample"]): imgs = dict() for k in logs: try: if len(logs[k].shape) == 4: img = custom_to_pil(logs[k][0, ...]) elif len(logs[k].shape) == 3: img = custom_to_pil(logs[k]) else: print(f"Unknown format for key {k}. ") img = None except: img = None imgs[k] = img return imgs @torch.no_grad() def convsample(model, shape, return_intermediates=True, verbose=True, make_prog_row=False): if not make_prog_row: return model.p_sample_loop(None, shape, return_intermediates=return_intermediates, verbose=verbose) else: return model.progressive_denoising( None, shape, verbose=True ) @torch.no_grad() def convsample_ddim(model, steps, shape, eta=1.0 ): ddim = DDIMSampler(model) bs = shape[0] shape = shape[1:] samples, intermediates = ddim.sample(steps, batch_size=bs, shape=shape, eta=eta, verbose=False,) return samples, intermediates @torch.no_grad() def make_convolutional_sample(model, batch_size, vanilla=False, custom_steps=None, eta=1.0,): log = dict() shape = [batch_size, model.model.diffusion_model.in_channels, model.model.diffusion_model.image_size, model.model.diffusion_model.image_size] with model.ema_scope("Plotting"): t0 = time.time() if vanilla: sample, progrow = convsample(model, shape, make_prog_row=True) else: sample, intermediates = convsample_ddim(model, steps=custom_steps, shape=shape, eta=eta) t1 = time.time() x_sample = model.decode_first_stage(sample) log["sample"] = x_sample log["time"] = t1 - t0 log['throughput'] = sample.shape[0] / (t1 - t0) print(f'Throughput for this batch: {log["throughput"]}') return log def run(model, logdir, batch_size=50, vanilla=False, custom_steps=None, eta=None, n_samples=50000, nplog=None): if vanilla: print(f'Using Vanilla DDPM sampling with {model.num_timesteps} sampling steps.') else: print(f'Using DDIM sampling with {custom_steps} sampling steps and eta={eta}') tstart = time.time() n_saved = len(glob.glob(os.path.join(logdir,'*.png')))-1 # path = logdir if model.cond_stage_model is None: all_images = [] print(f"Running unconditional sampling for {n_samples} samples") for _ in trange(n_samples // batch_size, desc="Sampling Batches (unconditional)"): logs = make_convolutional_sample(model, batch_size=batch_size, vanilla=vanilla, custom_steps=custom_steps, eta=eta) n_saved = save_logs(logs, logdir, n_saved=n_saved, key="sample") all_images.extend([custom_to_np(logs["sample"])]) if n_saved >= n_samples: print(f'Finish after generating {n_saved} samples') break all_img = np.concatenate(all_images, axis=0) all_img = all_img[:n_samples] shape_str = "x".join([str(x) for x in all_img.shape]) nppath = os.path.join(nplog, f"{shape_str}-samples.npz") np.savez(nppath, all_img) else: raise NotImplementedError('Currently only sampling for unconditional models supported.') print(f"sampling of {n_saved} images finished in {(time.time() - tstart) / 60.:.2f} minutes.") def save_logs(logs, path, n_saved=0, key="sample", np_path=None): for k in logs: if k == key: batch = logs[key] if np_path is None: for x in batch: img = custom_to_pil(x) imgpath = os.path.join(path, f"{key}_{n_saved:06}.png") img.save(imgpath) n_saved += 1 else: npbatch = custom_to_np(batch) shape_str = "x".join([str(x) for x in npbatch.shape]) nppath = os.path.join(np_path, f"{n_saved}-{shape_str}-samples.npz") np.savez(nppath, npbatch) n_saved += npbatch.shape[0] return n_saved def get_parser(): parser = argparse.ArgumentParser() parser.add_argument( "-r", "--resume", type=str, nargs="?", help="load from logdir or checkpoint in logdir", ) parser.add_argument( "-n", "--n_samples", type=int, nargs="?", help="number of samples to draw", default=50000 ) parser.add_argument( "-e", "--eta", type=float, nargs="?", help="eta for ddim sampling (0.0 yields deterministic sampling)", default=1.0 ) parser.add_argument( "-v", "--vanilla_sample", default=False, action='store_true', help="vanilla sampling (default option is DDIM sampling)?", ) parser.add_argument( "-l", "--logdir", type=str, nargs="?", help="extra logdir", default="none" ) parser.add_argument( "-c", "--custom_steps", type=int, nargs="?", help="number of steps for ddim and fastdpm sampling", default=50 ) parser.add_argument( "--batch_size", type=int, nargs="?", help="the bs", default=10 ) return parser def load_model_from_config(config, sd): model = instantiate_from_config(config) model.load_state_dict(sd,strict=False) model.cuda() model.eval() return model def load_model(config, ckpt, gpu, eval_mode): if ckpt: print(f"Loading model from {ckpt}") pl_sd = torch.load(ckpt, map_location="cpu") global_step = pl_sd["global_step"] else: pl_sd = {"state_dict": None} global_step = None model = load_model_from_config(config.model, pl_sd["state_dict"]) return model, global_step if __name__ == "__main__": now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") sys.path.append(os.getcwd()) command = " ".join(sys.argv) parser = get_parser() opt, unknown = parser.parse_known_args() ckpt = None if not os.path.exists(opt.resume): raise ValueError("Cannot find {}".format(opt.resume)) if os.path.isfile(opt.resume): # paths = opt.resume.split("/") try: logdir = '/'.join(opt.resume.split('/')[:-1]) # idx = len(paths)-paths[::-1].index("logs")+1 print(f'Logdir is {logdir}') except ValueError: paths = opt.resume.split("/") idx = -2 # take a guess: path/to/logdir/checkpoints/model.ckpt logdir = "/".join(paths[:idx]) ckpt = opt.resume else: assert os.path.isdir(opt.resume), f"{opt.resume} is not a directory" logdir = opt.resume.rstrip("/") ckpt = os.path.join(logdir, "model.ckpt") base_configs = sorted(glob.glob(os.path.join(logdir, "config.yaml"))) opt.base = base_configs configs = [OmegaConf.load(cfg) for cfg in opt.base] cli = OmegaConf.from_dotlist(unknown) config = OmegaConf.merge(*configs, cli) gpu = True eval_mode = True if opt.logdir != "none": locallog = logdir.split(os.sep)[-1] if locallog == "": locallog = logdir.split(os.sep)[-2] print(f"Switching logdir from '{logdir}' to '{os.path.join(opt.logdir, locallog)}'") logdir = os.path.join(opt.logdir, locallog) print(config) model, global_step = load_model(config, ckpt, gpu, eval_mode) print(f"global step: {global_step}") print(75 * "=") print("logging to:") logdir = os.path.join(logdir, "samples", f"{global_step:08}", now) imglogdir = os.path.join(logdir, "img") numpylogdir = os.path.join(logdir, "numpy") os.makedirs(imglogdir) os.makedirs(numpylogdir) print(logdir) print(75 * "=") # write config out sampling_file = os.path.join(logdir, "sampling_config.yaml") sampling_conf = vars(opt) with open(sampling_file, 'w') as f: yaml.dump(sampling_conf, f, default_flow_style=False) print(sampling_conf) run(model, imglogdir, eta=opt.eta, vanilla=opt.vanilla_sample, n_samples=opt.n_samples, custom_steps=opt.custom_steps, batch_size=opt.batch_size, nplog=numpylogdir) print("done.") ================================================ FILE: scripts/score.py ================================================ import clip import torch import os from PIL import Image import lpips from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize from einops import repeat import numpy as np import pickle try: from torchvision.transforms import InterpolationMode BICUBIC = InterpolationMode.BICUBIC except ImportError: BICUBIC = Image.BICUBIC import argparse parser = argparse.ArgumentParser() parser.add_argument('--img_dir', type=str, required=True) parser.add_argument('--mode', type=str, default='K', choices=['K', 'beta']) parser.add_argument('--prompt', type=str, required=True) parser.add_argument('--orig_img', type=str, required=True) opt = parser.parse_args() def _transform(): return Compose([ Resize((512, 512), interpolation=BICUBIC), _convert_image_to_rgb, ToTensor(), # Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) def _convert_image_to_rgb(image): return image.convert("RGB") ORIG_IMAGE_PATH = opt.orig_img TGT_TEXT = opt.prompt IMAGE_DIR = opt.img_dir # import ipdb # ipdb.set_trace() if opt.mode == 'K': Xs = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000] else: Xs = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] NUM_IMGS_PER_EXP = 20 device = "cuda" if torch.cuda.is_available() else "cpu" img_paths = [os.path.join(IMAGE_DIR, f) for f in sorted(os.listdir(IMAGE_DIR)) if os.path.isfile(os.path.join(IMAGE_DIR, f))] loss_fn = lpips.LPIPS(net='alex').to(device) orig_img = Image.open(ORIG_IMAGE_PATH) transform = _transform() orig_img = transform(orig_img).unsqueeze(0).to(device) # orig_img = repeat(orig_img, '1 c h w -> n c h w', n=len(img_paths)) imgs = [transform(Image.open(img_path)).unsqueeze(0).to(device) for img_path in img_paths] lpips_scores = [loss_fn(img, orig_img).item() for img in imgs] print(lpips_scores) model, preprocess = clip.load("ViT-B/32", device=device) imgs = [preprocess(Image.open(img_path)).unsqueeze(0).to(device) for img_path in img_paths] text_tokens = clip.tokenize(TGT_TEXT).to(device) with torch.no_grad(): text_features = model.encode_text(text_tokens) img_features = model.encode_image(torch.cat(imgs, dim=0)) img_features /= img_features.norm(dim=-1, keepdim=True) text_features /= text_features.norm(dim=-1, keepdim=True) similarity = (100.0 * img_features @ text_features.T).squeeze() print(similarity) l_score_exps = list() c_score_exps = list() x_axis = list() for i in range(len(img_paths)): if i % NUM_IMGS_PER_EXP == 0: l_score_exps.append(list()) c_score_exps.append(list()) x_axis.append([Xs[i//NUM_IMGS_PER_EXP]] * NUM_IMGS_PER_EXP) l_score_exps[-1].append(lpips_scores[i]) c_score_exps[-1].append(similarity[i].item()) result = dict() result['l_score_exps'] = l_score_exps result['c_score_exps'] = c_score_exps result['x_axis'] = x_axis pickle.dump(result, open(os.path.join(os.path.dirname(IMAGE_DIR), 'scores.pkl'), 'wb')) # import ipdb # ipdb.set_trace() # print([np.mean(l_score) for l_score in l_score_exps]) # print([np.mean(c_score) for c_score in c_score_exps]) ================================================ FILE: scripts/stable_txt2img_guidance.py ================================================ import argparse, os, sys, glob import torch import numpy as np from omegaconf import OmegaConf from PIL import Image from tqdm import tqdm, trange from itertools import islice from einops import rearrange from torchvision.utils import make_grid, save_image import time from pytorch_lightning import seed_everything from torch import autocast from contextlib import contextmanager, nullcontext from ldm.util import instantiate_from_config from ldm.models.diffusion.ddim import DDIMSampler from ldm.models.diffusion.plms import PLMSSampler def chunk(it, size): it = iter(it) return iter(lambda: tuple(islice(it, size)), ()) def load_model_from_config(config, ckpt, verbose=False): print(f"Loading model from {ckpt}") pl_sd = torch.load(ckpt, map_location="cpu") if "global_step" in pl_sd: print(f"Global Step: {pl_sd['global_step']}") sd = pl_sd["state_dict"] model = instantiate_from_config(config.model) m, u = model.load_state_dict(sd, strict=False) if len(m) > 0 and verbose: print("missing keys:") print(m) if len(u) > 0 and verbose: print("unexpected keys:") print(u) model.cuda() model.eval() return model def main(): parser = argparse.ArgumentParser() parser.add_argument( "--prompt", type=str, nargs="?", default="a painting of a virus monster playing guitar", help="the prompt to render" ) parser.add_argument( "--outdir", type=str, nargs="?", help="dir to write results to", default="outputs/txt2img-samples" ) parser.add_argument( "--skip_grid", action='store_true', help="do not save a grid, only individual samples. Helpful when evaluating lots of samples", ) parser.add_argument( "--skip_save", action='store_true', help="do not save individual samples. For speed measurements.", ) parser.add_argument( "--ddim_steps", type=int, default=50, help="number of ddim sampling steps", ) parser.add_argument( "--plms", action='store_true', help="use plms sampling", ) parser.add_argument( "--laion400m", action='store_true', help="uses the LAION400M model", ) parser.add_argument( "--fixed_code", action='store_true', help="if enabled, uses the same starting code across samples ", ) parser.add_argument( "--ddim_eta", type=float, default=0.0, help="ddim eta (eta=0.0 corresponds to deterministic sampling", ) parser.add_argument( "--n_iter", type=int, default=2, help="sample this often", ) parser.add_argument( "--H", type=int, default=512, help="image height, in pixel space", ) parser.add_argument( "--W", type=int, default=512, help="image width, in pixel space", ) parser.add_argument( "--C", type=int, default=4, help="latent channels", ) parser.add_argument( "--f", type=int, default=8, help="downsampling factor", ) parser.add_argument( "--n_samples", type=int, default=3, help="how many samples to produce for each given prompt. A.k.a. batch size", ) parser.add_argument( "--n_rows", type=int, default=0, help="rows in the grid (default: n_samples)", ) parser.add_argument( "--scale", type=float, default=7.5, help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))", ) parser.add_argument( "--from-file", type=str, help="if specified, load prompts from this file", ) parser.add_argument( "--config", type=str, default="configs/stable-diffusion/v1-inference.yaml", help="path to config which constructs model", ) parser.add_argument( "--ckpt", type=str, default="models/ldm/stable-diffusion-v4/sd-v1-4-full-ema.ckpt", help="path to checkpoint of model", ) parser.add_argument( "--sin_config", type=str, default="configs/stable-diffusion/v1-inference.yaml", ) parser.add_argument( "--sin_ckpt", type=str, required=True, ) parser.add_argument( "--seed", type=int, default=42, help="the seed (for reproducible sampling)", ) parser.add_argument( "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast" ) parser.add_argument("--single_guidance", action="store_true") parser.add_argument("--range_t_max", type=int, default=400) parser.add_argument("--range_t_min", type=int, default=1) parser.add_argument("--cond_beta", type=float, default=0.5) parser.add_argument( "--embedding_path", type=str, help="Path to a pre-trained embedding manager checkpoint") opt = parser.parse_args() if opt.laion400m: print("Falling back to LAION 400M model...") opt.config = "configs/latent-diffusion/txt2img-1p4B-eval.yaml" opt.ckpt = "models/ldm/text2img-large/model.ckpt" opt.outdir = "outputs/txt2img-samples-laion400m" seed_everything(opt.seed) config = OmegaConf.load(f"{opt.config}") model = load_model_from_config(config, f"{opt.ckpt}") opt.cond_beta_sin = 1. - opt.cond_beta model.extra_config = vars(opt) #model.embedding_manager.load(opt.embedding_path) device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") model = model.to(device) sin_config = OmegaConf.load(f"{opt.sin_config}") sin_model = load_model_from_config(sin_config, f"{opt.sin_ckpt}") sin_model = sin_model.to(device) if opt.plms: sampler = PLMSSampler(model) else: if opt.single_guidance: from ldm.models.diffusion.guidance_ddim import DDIMSinSampler sampler = DDIMSinSampler(model, sin_model) else: sampler = DDIMSampler(model) os.makedirs(opt.outdir, exist_ok=True) outpath = opt.outdir batch_size = opt.n_samples n_rows = opt.n_rows if opt.n_rows > 0 else batch_size if not opt.from_file: prompt = opt.prompt assert prompt is not None data = [batch_size * [prompt]] else: print(f"reading prompts from {opt.from_file}") with open(opt.from_file, "r") as f: data = f.read().splitlines() data = list(chunk(data, batch_size)) sample_path = os.path.join(outpath, "samples") os.makedirs(sample_path, exist_ok=True) base_count = len(os.listdir(sample_path)) grid_count = len(os.listdir(outpath)) - 1 start_code = None if opt.fixed_code: start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device) precision_scope = autocast if opt.precision=="autocast" else nullcontext with torch.no_grad(): with precision_scope("cuda"): with model.ema_scope(): with sin_model.ema_scope(): tic = time.time() all_samples = list() for n in trange(opt.n_iter, desc="Sampling"): for prompts in tqdm(data, desc="data"): uc = None if opt.scale != 1.0: uc = model.get_learned_conditioning(batch_size * [""]) uc_sin = sin_model.get_learned_conditioning(batch_size * [""]) if isinstance(prompts, tuple): prompts = list(prompts) if opt.single_guidance: b = len(prompts) prompt = prompts[0] prompts = [prompt.split('[SEP]')[0].strip()] * b prompts_single = [prompt.split('[SEP]')[1].strip()] * b c = model.get_learned_conditioning(prompts) c_sin = sin_model.get_learned_conditioning(prompts_single) shape = [opt.C, opt.H // opt.f, opt.W // opt.f] samples_ddim, _ = sampler.sample(S=opt.ddim_steps, conditioning=c, conditioning_single=c_sin, batch_size=opt.n_samples, shape=shape, verbose=False, unconditional_guidance_scale=opt.scale, unconditional_conditioning=uc, unconditional_conditioning_single=uc_sin, eta=opt.ddim_eta, x_T=start_code) x_samples_ddim = model.decode_first_stage(samples_ddim) x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) if not opt.skip_save: for x_sample in x_samples_ddim: x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c') Image.fromarray(x_sample.astype(np.uint8)).save( os.path.join(sample_path, f"{base_count:05}.jpg")) base_count += 1 if not opt.skip_grid: all_samples.append(x_samples_ddim) if not opt.skip_grid: # additionally, save as grid grid = torch.stack(all_samples, 0) grid = rearrange(grid, 'n b c h w -> (n b) c h w') for i in range(grid.size(0)): save_image(grid[i, :, :, :], os.path.join(outpath,opt.prompt+'_{}.png'.format(i))) grid = make_grid(grid, nrow=n_rows) # to image grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy() Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'{prompt.replace(" ", "-")}-{grid_count:04}.jpg')) grid_count += 1 toc = time.time() print(f"Your samples are ready and waiting for you here: \n{outpath} \n" f" \nEnjoy.") if __name__ == "__main__": main() ================================================ FILE: scripts/stable_txt2img_multi_guidance.py ================================================ import argparse, os, sys, glob import torch import numpy as np from omegaconf import OmegaConf from PIL import Image from tqdm import tqdm, trange from itertools import islice from einops import rearrange from torchvision.utils import make_grid, save_image import time from pytorch_lightning import seed_everything from torch import autocast from contextlib import contextmanager, nullcontext from ldm.util import instantiate_from_config from ldm.models.diffusion.ddim import DDIMSampler from ldm.models.diffusion.plms import PLMSSampler def chunk(it, size): it = iter(it) return iter(lambda: tuple(islice(it, size)), ()) def load_model_from_config(config, ckpt, verbose=False): print(f"Loading model from {ckpt}") pl_sd = torch.load(ckpt, map_location="cpu") if "global_step" in pl_sd: print(f"Global Step: {pl_sd['global_step']}") sd = pl_sd["state_dict"] model = instantiate_from_config(config.model) m, u = model.load_state_dict(sd, strict=False) if len(m) > 0 and verbose: print("missing keys:") print(m) if len(u) > 0 and verbose: print("unexpected keys:") print(u) model.cuda() model.eval() return model def main(): parser = argparse.ArgumentParser() parser.add_argument( "--prompt", type=str, nargs="?", default="a painting of a virus monster playing guitar", help="the prompt to render" ) parser.add_argument( "--outdir", type=str, nargs="?", help="dir to write results to", default="outputs/txt2img-samples" ) parser.add_argument( "--skip_grid", action='store_true', help="do not save a grid, only individual samples. Helpful when evaluating lots of samples", ) parser.add_argument( "--skip_save", action='store_true', help="do not save individual samples. For speed measurements.", ) parser.add_argument( "--ddim_steps", type=int, default=50, help="number of ddim sampling steps", ) parser.add_argument( "--plms", action='store_true', help="use plms sampling", ) parser.add_argument( "--laion400m", action='store_true', help="uses the LAION400M model", ) parser.add_argument( "--fixed_code", action='store_true', help="if enabled, uses the same starting code across samples ", ) parser.add_argument( "--ddim_eta", type=float, default=0.0, help="ddim eta (eta=0.0 corresponds to deterministic sampling", ) parser.add_argument( "--n_iter", type=int, default=2, help="sample this often", ) parser.add_argument( "--H", type=int, default=512, help="image height, in pixel space", ) parser.add_argument( "--W", type=int, default=512, help="image width, in pixel space", ) parser.add_argument( "--C", type=int, default=4, help="latent channels", ) parser.add_argument( "--f", type=int, default=8, help="downsampling factor", ) parser.add_argument( "--n_samples", type=int, default=3, help="how many samples to produce for each given prompt. A.k.a. batch size", ) parser.add_argument( "--n_rows", type=int, default=0, help="rows in the grid (default: n_samples)", ) parser.add_argument( "--scale", type=float, default=7.5, help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))", ) parser.add_argument( "--from-file", type=str, help="if specified, load prompts from this file", ) parser.add_argument( "--config", type=str, default="configs/stable-diffusion/v1-inference.yaml", help="path to config which constructs model", ) parser.add_argument( "--ckpt", type=str, default="models/ldm/stable-diffusion-v4/sd-v1-4-full-ema.ckpt", help="path to checkpoint of model", ) parser.add_argument( "--sin_config", type=str, default="configs/stable-diffusion/v1-inference.yaml", nargs='+' ) parser.add_argument( "--sin_ckpt", type=str, required=True, nargs='+' ) parser.add_argument( "--seed", type=int, default=42, help="the seed (for reproducible sampling)", ) parser.add_argument( "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast" ) parser.add_argument("--single_guidance", action="store_true") parser.add_argument("--range_t_max", type=int, default=400, nargs='+') parser.add_argument("--range_t_min", type=int, default=1, nargs='+') parser.add_argument("--beta", type=float, default=0.5, nargs='+') parser.add_argument( "--embedding_path", type=str, help="Path to a pre-trained embedding manager checkpoint") opt = parser.parse_args() if opt.laion400m: print("Falling back to LAION 400M model...") opt.config = "configs/latent-diffusion/txt2img-1p4B-eval.yaml" opt.ckpt = "models/ldm/text2img-large/model.ckpt" opt.outdir = "outputs/txt2img-samples-laion400m" seed_everything(opt.seed) # import ipdb # ipdb.set_trace() config = OmegaConf.load(f"{opt.config}") model = load_model_from_config(config, f"{opt.ckpt}") #model.embedding_manager.load(opt.embedding_path) device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") model = model.to(device) guidance_configs = opt.sin_config guidance_ckpts = opt.sin_ckpt # import ipdb # ipdb.set_trace() assert len(guidance_configs) == len(guidance_ckpts) num_guidance = len(guidance_configs) guidance_betas = opt.beta if isinstance(opt.beta, list) else [opt.beta]*num_guidance guidance_range_t_max = opt.range_t_max if isinstance(opt.range_t_max, list) else [opt.range_t_max]*num_guidance guidance_range_t_min = opt.range_t_min if isinstance(opt.range_t_min, list) else [opt.range_t_min]*num_guidance assert len(guidance_betas) == num_guidance and len(guidance_range_t_max) == num_guidance and len(guidance_range_t_min) == num_guidance guidance_models = [load_model_from_config(OmegaConf.load(config), ckpt).to(device) for config, ckpt in zip(guidance_configs, guidance_ckpts)] for sin_model, beta, t_max, t_min in zip(guidance_models, guidance_betas, guidance_range_t_max, guidance_range_t_min): sin_model.extra_config = {"beta": beta, "range_t_max": t_max, "range_t_min": t_min} from ldm.models.diffusion.guidance_ddim import DDIMMultiSampler sampler = DDIMMultiSampler(model, guidance_models) os.makedirs(opt.outdir, exist_ok=True) outpath = opt.outdir batch_size = opt.n_samples n_rows = opt.n_rows if opt.n_rows > 0 else batch_size if not opt.from_file: prompt = opt.prompt assert prompt is not None data = [batch_size * [prompt]] else: print(f"reading prompts from {opt.from_file}") with open(opt.from_file, "r") as f: data = f.read().splitlines() data = list(chunk(data, batch_size)) sample_path = os.path.join(outpath, "samples") os.makedirs(sample_path, exist_ok=True) base_count = len(os.listdir(sample_path)) grid_count = len(os.listdir(outpath)) - 1 start_code = None if opt.fixed_code: start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device) precision_scope = autocast if opt.precision=="autocast" else nullcontext with torch.no_grad(): with precision_scope("cuda"): with model.ema_scope(): for sin_model in guidance_models: sin_model.ema_scope() # with guidance_models[0].ema_scope(): tic = time.time() all_samples = list() for n in trange(opt.n_iter, desc="Sampling"): for prompts in tqdm(data, desc="data"): # import ipdb # ipdb.set_trace() uc = None if opt.scale != 1.0: uc = model.get_learned_conditioning(batch_size * [""]) uc_sin_list = [sin_model.get_learned_conditioning(batch_size * [""]) for sin_model in guidance_models] if isinstance(prompts, tuple): prompts = list(prompts) if opt.single_guidance: b = len(prompts) prompt = prompts[0] prompts = [prompt.split('[SEP]')[0].strip()] * b prompts_single = [[p.strip()] * b for p in prompt.split('[SEP]')[1:]] assert len(prompts_single) == num_guidance # import ipdb # ipdb.set_trace() c = model.get_learned_conditioning(prompts) c_sin_list = [sin_model.get_learned_conditioning(p) for sin_model, p in zip(guidance_models, prompts_single)] shape = [opt.C, opt.H // opt.f, opt.W // opt.f] samples_ddim, _ = sampler.sample(S=opt.ddim_steps, conditioning=c, conditioning_single_list=c_sin_list, batch_size=opt.n_samples, shape=shape, verbose=False, unconditional_guidance_scale=opt.scale, unconditional_conditioning=uc, unconditional_conditioning_single_list=uc_sin_list, eta=opt.ddim_eta, x_T=start_code) x_samples_ddim = model.decode_first_stage(samples_ddim) x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) if not opt.skip_save: for x_sample in x_samples_ddim: x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c') Image.fromarray(x_sample.astype(np.uint8)).save( os.path.join(sample_path, f"{base_count:05}.jpg")) base_count += 1 if not opt.skip_grid: all_samples.append(x_samples_ddim) if not opt.skip_grid: # additionally, save as grid grid = torch.stack(all_samples, 0) grid = rearrange(grid, 'n b c h w -> (n b) c h w') for i in range(grid.size(0)): save_image(grid[i, :, :, :], os.path.join(outpath,opt.prompt+'_{}.png'.format(i))) grid = make_grid(grid, nrow=n_rows) # to image grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy() Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'{prompt.replace(" ", "-")}-{grid_count:04}.jpg')) grid_count += 1 toc = time.time() print(f"Your samples are ready and waiting for you here: \n{outpath} \n" f" \nEnjoy.") if __name__ == "__main__": main() ================================================ FILE: setup.py ================================================ from setuptools import setup, find_packages setup( name='latent-diffusion', version='0.0.1', description='', packages=find_packages(), install_requires=[ 'torch', 'numpy', 'tqdm', ], )