Full Code of csslc/CCSR for AI

CCSR-v2.0 878f5adf2ba8 cached

80 files

842.8 KB

207.0k tokens

535 symbols

1 requests

Download .txt

Showing preview only (877K chars total). Download the full file or copy to clipboard to get everything.

Repository: csslc/CCSR
Branch: CCSR-v2.0
Commit: 878f5adf2ba8
Files: 80
Total size: 842.8 KB

Directory structure:
gitextract_wzfbvs4n/

├── .idea/
│   ├── CCSR.iml
│   ├── inspectionProfiles/
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── ADD/
│   ├── dnnlib/
│   │   ├── __init__.py
│   │   └── util.py
│   ├── layers/
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── block.py
│   │   ├── dino_head.py
│   │   ├── drop_path.py
│   │   ├── layer_scale.py
│   │   ├── mlp.py
│   │   ├── patch_embed.py
│   │   └── swiglu_ffn.py
│   ├── models/
│   │   ├── discriminator.py
│   │   └── vit.py
│   ├── th_utils/
│   │   ├── __init__.py
│   │   ├── custom_ops.py
│   │   ├── misc.py
│   │   └── ops/
│   │       ├── __init__.py
│   │       ├── bias_act.cpp
│   │       ├── bias_act.cu
│   │       ├── bias_act.h
│   │       ├── bias_act.py
│   │       ├── conv2d_gradfix.py
│   │       ├── conv2d_resample.py
│   │       ├── filtered_lrelu.cpp
│   │       ├── filtered_lrelu.cu
│   │       ├── filtered_lrelu.h
│   │       ├── filtered_lrelu.py
│   │       ├── filtered_lrelu_ns.cu
│   │       ├── filtered_lrelu_rd.cu
│   │       ├── filtered_lrelu_wr.cu
│   │       ├── fma.py
│   │       ├── grid_sample_gradfix.py
│   │       ├── upfirdn2d.cpp
│   │       ├── upfirdn2d.cu
│   │       ├── upfirdn2d.h
│   │       └── upfirdn2d.py
│   └── utils/
│       └── util_net.py
├── LICENSE
├── README.md
├── dataloaders/
│   ├── paired_dataset_txt.py
│   ├── params_ccsr.yml
│   └── realesrgan.py
├── models/
│   ├── DiffAugment.py
│   ├── controlnet.py
│   ├── losses/
│   │   ├── __init__.py
│   │   ├── contperceptual.py
│   │   └── vqperceptual.py
│   ├── shared.py
│   ├── unet_2d_blocks.py
│   ├── unet_2d_condition.py
│   └── vit_utils.py
├── myutils/
│   ├── devices.py
│   ├── img_util.py
│   ├── misc.py
│   ├── vaehook.py
│   └── wavelet_color_fix.py
├── pipelines/
│   └── pipeline_ccsr.py
├── requirements.txt
├── scripts/
│   ├── get_path.py
│   ├── test/
│   │   ├── test_ccsr_multistep.sh
│   │   ├── test_ccsr_onestep.sh
│   │   └── test_ccsr_tile.sh
│   └── train/
│       ├── train_ccsr_stage1.sh
│       ├── train_ccsr_stage2.sh
│       └── train_controlnet.sh
├── test_ccsr_tile.py
├── train_ccsr_stage1.py
├── train_ccsr_stage2.py
├── train_controlnet.py
└── utils/
    ├── devices.py
    ├── img_util.py
    ├── misc.py
    ├── vaehook.py
    └── wavelet_color_fix.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .idea/CCSR.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>

================================================
FILE: .idea/inspectionProfiles/Project_Default.xml
================================================
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="1">
            <item index="0" class="java.lang.String" itemvalue="opencv-python" />
          </list>
        </value>
      </option>
    </inspection_tool>
  </profile>
</component>

================================================
FILE: .idea/inspectionProfiles/profiles_settings.xml
================================================
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

================================================
FILE: .idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/CCSR.iml" filepath="$PROJECT_DIR$/.idea/CCSR.iml" />
    </modules>
  </component>
</project>

================================================
FILE: .idea/vcs.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

================================================
FILE: .idea/workspace.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ChangeListManager">
    <list default="true" id="05c337e8-f34e-4a2e-abd3-dc65a3efd14a" name="Changes" comment="" />
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="Git.Settings">
    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
  </component>
  <component name="MarkdownSettingsMigration">
    <option name="stateVersion" value="1" />
  </component>
  <component name="ProjectId" id="2qOgvG2MNzxubwwA97EeHYZVB9s" />
  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
  <component name="ProjectViewState">
    <option name="hideEmptyMiddlePackages" value="true" />
    <option name="showLibraryContents" value="true" />
  </component>
  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="05c337e8-f34e-4a2e-abd3-dc65a3efd14a" name="Changes" comment="" />
      <created>1734539270044</created>
      <option name="number" value="Default" />
      <option name="presentableId" value="Default" />
      <updated>1734539270044</updated>
    </task>
    <servers />
  </component>
</project>

================================================
FILE: ADD/dnnlib/__init__.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

from .util import EasyDict, make_cache_dir_path


================================================
FILE: ADD/dnnlib/util.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Miscellaneous utility classes and functions."""

import ctypes
import fnmatch
import importlib
import inspect
import os
import sys
import types
import io
import pickle
import re
import requests
import html
import hashlib
import glob
import tempfile
import urllib
import urllib.request
import uuid
from typing import Any, List, Tuple, Union, Optional
from distutils.util import strtobool
import shutil

import numpy as np


# Util classes
# ------------------------------------------------------------------------------------------

class EasyDict(dict):
    """Convenience class that behaves like a dict but allows access with the attribute syntax."""

    def __getattr__(self, name: str) -> Any:
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name: str, value: Any) -> None:
        self[name] = value

    def __delattr__(self, name: str) -> None:
        del self[name]


class Logger(object):
    """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file."""

    def __init__(self, file_name: Optional[str] = None, file_mode: str = "w", should_flush: bool = True):
        self.file = None

        if file_name is not None:
            self.file = open(file_name, file_mode)

        self.should_flush = should_flush
        self.stdout = sys.stdout
        self.stderr = sys.stderr

        sys.stdout = self
        sys.stderr = self

    def __enter__(self) -> "Logger":
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.close()

    def write(self, text: Union[str, bytes]) -> None:
        """Write text to stdout (and a file) and optionally flush."""
        if isinstance(text, bytes):
            text = text.decode()
        if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
            return

        if self.file is not None:
            self.file.write(text)

        self.stdout.write(text)

        if self.should_flush:
            self.flush()

    def flush(self) -> None:
        """Flush written text to both stdout and a file, if open."""
        if self.file is not None:
            self.file.flush()

        self.stdout.flush()

    def close(self) -> None:
        """Flush, close possible files, and remove stdout/stderr mirroring."""
        self.flush()

        # if using multiple loggers, prevent closing in wrong order
        if sys.stdout is self:
            sys.stdout = self.stdout
        if sys.stderr is self:
            sys.stderr = self.stderr

        if self.file is not None:
            self.file.close()
            self.file = None


# Cache directories
# ------------------------------------------------------------------------------------------

_dnnlib_cache_dir = None

def set_cache_dir(path: str) -> None:
    global _dnnlib_cache_dir
    _dnnlib_cache_dir = path


def make_cache_dir_path(*paths: str) -> str:
    if _dnnlib_cache_dir is not None:
        return os.path.join(_dnnlib_cache_dir, *paths)
    if 'DNNLIB_CACHE_DIR' in os.environ:
        return os.path.join(os.environ['DNNLIB_CACHE_DIR'], *paths)
    if 'HOME' in os.environ:
        return os.path.join(os.environ['HOME'], '.cache', 'dnnlib', *paths)
    if 'USERPROFILE' in os.environ:
        return os.path.join(os.environ['USERPROFILE'], '.cache', 'dnnlib', *paths)
    return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths)


# Small util functions
# ------------------------------------------------------------------------------------------

def format_time(seconds: Union[int, float]) -> str:
    """Convert the seconds to human readable string with days, hours, minutes and seconds."""
    s = int(np.rint(seconds))

    if s < 60:
        return "{0}s".format(s)
    elif s < 60 * 60:
        return "{0}m {1:02}s".format(s // 60, s % 60)
    elif s < 24 * 60 * 60:
        return "{0}h {1:02}m {2:02}s".format(s // (60 * 60), (s // 60) % 60, s % 60)
    else:
        return "{0}d {1:02}h {2:02}m".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24, (s // 60) % 60)


def format_time_brief(seconds: Union[int, float]) -> str:
    """Convert the seconds to human readable string with days, hours, minutes and seconds."""
    s = int(np.rint(seconds))

    if s < 60:
        return "{0}s".format(s)
    elif s < 60 * 60:
        return "{0}m {1:02}s".format(s // 60, s % 60)
    elif s < 24 * 60 * 60:
        return "{0}h {1:02}m".format(s // (60 * 60), (s // 60) % 60)
    else:
        return "{0}d {1:02}h".format(s // (24 * 60 * 60), (s // (60 * 60)) % 24)


def ask_yes_no(question: str) -> bool:
    """Ask the user the question until the user inputs a valid answer."""
    while True:
        try:
            print("{0} [y/n]".format(question))
            return strtobool(input().lower())
        except ValueError:
            pass


def tuple_product(t: Tuple) -> Any:
    """Calculate the product of the tuple elements."""
    result = 1

    for v in t:
        result *= v

    return result


_str_to_ctype = {
    "uint8": ctypes.c_ubyte,
    "uint16": ctypes.c_uint16,
    "uint32": ctypes.c_uint32,
    "uint64": ctypes.c_uint64,
    "int8": ctypes.c_byte,
    "int16": ctypes.c_int16,
    "int32": ctypes.c_int32,
    "int64": ctypes.c_int64,
    "float32": ctypes.c_float,
    "float64": ctypes.c_double
}


def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]:
    """Given a type name string (or an object having a __name__ attribute), return matching Numpy and ctypes types that have the same size in bytes."""
    type_str = None

    if isinstance(type_obj, str):
        type_str = type_obj
    elif hasattr(type_obj, "__name__"):
        type_str = type_obj.__name__
    elif hasattr(type_obj, "name"):
        type_str = type_obj.name
    else:
        raise RuntimeError("Cannot infer type name from input")

    assert type_str in _str_to_ctype.keys()

    my_dtype = np.dtype(type_str)
    my_ctype = _str_to_ctype[type_str]

    assert my_dtype.itemsize == ctypes.sizeof(my_ctype)

    return my_dtype, my_ctype


def is_pickleable(obj: Any) -> bool:
    try:
        with io.BytesIO() as stream:
            pickle.dump(obj, stream)
        return True
    except:
        return False


# Functionality to import modules/objects by name, and call functions by name
# ------------------------------------------------------------------------------------------

def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]:
    """Searches for the underlying module behind the name to some python object.
    Returns the module and the object name (original name with module part removed)."""

    # allow convenience shorthands, substitute them by full names
    obj_name = re.sub("^np.", "numpy.", obj_name)
    obj_name = re.sub("^tf.", "tensorflow.", obj_name)

    # list alternatives for (module_name, local_obj_name)
    parts = obj_name.split(".")
    name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)]

    # try each alternative in turn
    for module_name, local_obj_name in name_pairs:
        try:
            module = importlib.import_module(module_name) # may raise ImportError
            get_obj_from_module(module, local_obj_name) # may raise AttributeError
            return module, local_obj_name
        except:
            pass

    # maybe some of the modules themselves contain errors?
    for module_name, _local_obj_name in name_pairs:
        try:
            importlib.import_module(module_name) # may raise ImportError
        except ImportError:
            if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"):
                raise

    # maybe the requested attribute is missing?
    for module_name, local_obj_name in name_pairs:
        try:
            module = importlib.import_module(module_name) # may raise ImportError
            get_obj_from_module(module, local_obj_name) # may raise AttributeError
        except ImportError:
            pass

    # we are out of luck, but we have no idea why
    raise ImportError(obj_name)


def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
    """Traverses the object name and returns the last (rightmost) python object."""
    if obj_name == '':
        return module
    obj = module
    for part in obj_name.split("."):
        obj = getattr(obj, part)
    return obj


def get_obj_by_name(name: str) -> Any:
    """Finds the python object with the given name."""
    module, obj_name = get_module_from_obj_name(name)
    return get_obj_from_module(module, obj_name)


def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any:
    """Finds the python object with the given name and calls it as a function."""
    assert func_name is not None
    func_obj = get_obj_by_name(func_name)
    assert callable(func_obj)
    return func_obj(*args, **kwargs)


def construct_class_by_name(*args, class_name: str = None, **kwargs) -> Any:
    """Finds the python class with the given name and constructs it with the given arguments."""
    return call_func_by_name(*args, func_name=class_name, **kwargs)


def get_module_dir_by_obj_name(obj_name: str) -> str:
    """Get the directory path of the module containing the given object name."""
    module, _ = get_module_from_obj_name(obj_name)
    return os.path.dirname(inspect.getfile(module))


def is_top_level_function(obj: Any) -> bool:
    """Determine whether the given object is a top-level function, i.e., defined at module scope using 'def'."""
    return callable(obj) and obj.__name__ in sys.modules[obj.__module__].__dict__


def get_top_level_function_name(obj: Any) -> str:
    """Return the fully-qualified name of a top-level function."""
    assert is_top_level_function(obj)
    module = obj.__module__
    if module == '__main__':
        module = os.path.splitext(os.path.basename(sys.modules[module].__file__))[0]
    return module + "." + obj.__name__


# File system helpers
# ------------------------------------------------------------------------------------------

def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]:
    """List all files recursively in a given directory while ignoring given file and directory names.
    Returns list of tuples containing both absolute and relative paths."""
    assert os.path.isdir(dir_path)
    base_name = os.path.basename(os.path.normpath(dir_path))

    if ignores is None:
        ignores = []

    result = []

    for root, dirs, files in os.walk(dir_path, topdown=True):
        for ignore_ in ignores:
            dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)]

            # dirs need to be edited in-place
            for d in dirs_to_remove:
                dirs.remove(d)

            files = [f for f in files if not fnmatch.fnmatch(f, ignore_)]

        absolute_paths = [os.path.join(root, f) for f in files]
        relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths]

        if add_base_to_relative:
            relative_paths = [os.path.join(base_name, p) for p in relative_paths]

        assert len(absolute_paths) == len(relative_paths)
        result += zip(absolute_paths, relative_paths)

    return result


def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
    """Takes in a list of tuples of (src, dst) paths and copies files.
    Will create all necessary directories."""
    for file in files:
        target_dir_name = os.path.dirname(file[1])

        # will create all intermediate-level directories
        if not os.path.exists(target_dir_name):
            os.makedirs(target_dir_name)

        shutil.copyfile(file[0], file[1])


# URL helpers
# ------------------------------------------------------------------------------------------

def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
    """Determine whether the given object is a valid URL string."""
    if not isinstance(obj, str) or not "://" in obj:
        return False
    if allow_file_urls and obj.startswith('file://'):
        return True
    try:
        res = requests.compat.urlparse(obj)
        if not res.scheme or not res.netloc or not "." in res.netloc:
            return False
        res = requests.compat.urlparse(requests.compat.urljoin(obj, "/"))
        if not res.scheme or not res.netloc or not "." in res.netloc:
            return False
    except:
        return False
    return True


def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any:
    """Download the given URL and return a binary-mode file object to access the data."""
    assert num_attempts >= 1
    assert not (return_filename and (not cache))

    # Doesn't look like an URL scheme so interpret it as a local filename.
    if not re.match('^[a-z]+://', url):
        return url if return_filename else open(url, "rb")

    # Handle file URLs.  This code handles unusual file:// patterns that
    # arise on Windows:
    #
    # file:///c:/foo.txt
    #
    # which would translate to a local '/c:/foo.txt' filename that's
    # invalid.  Drop the forward slash for such pathnames.
    #
    # If you touch this code path, you should test it on both Linux and
    # Windows.
    #
    # Some internet resources suggest using urllib.request.url2pathname() but
    # but that converts forward slashes to backslashes and this causes
    # its own set of problems.
    if url.startswith('file://'):
        filename = urllib.parse.urlparse(url).path
        if re.match(r'^/[a-zA-Z]:', filename):
            filename = filename[1:]
        return filename if return_filename else open(filename, "rb")

    assert is_url(url)

    # Lookup from cache.
    if cache_dir is None:
        cache_dir = make_cache_dir_path('downloads')

    url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
    if cache:
        cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*"))
        if len(cache_files) == 1:
            filename = cache_files[0]
            return filename if return_filename else open(filename, "rb")

    # Download.
    url_name = None
    url_data = None
    with requests.Session() as session:
        if verbose:
            print("Downloading %s ..." % url, end="", flush=True)
        for attempts_left in reversed(range(num_attempts)):
            try:
                with session.get(url) as res:
                    res.raise_for_status()
                    if len(res.content) == 0:
                        raise IOError("No data received")

                    if len(res.content) < 8192:
                        content_str = res.content.decode("utf-8")
                        if "download_warning" in res.headers.get("Set-Cookie", ""):
                            links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
                            if len(links) == 1:
                                url = requests.compat.urljoin(url, links[0])
                                raise IOError("Google Drive virus checker nag")
                        if "Google Drive - Quota exceeded" in content_str:
                            raise IOError("Google Drive download quota exceeded -- please try again later")

                    match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
                    url_name = match[1] if match else url
                    url_data = res.content
                    if verbose:
                        print(" done")
                    break
            except KeyboardInterrupt:
                raise
            except:
                if not attempts_left:
                    if verbose:
                        print(" failed")
                    raise
                if verbose:
                    print(".", end="", flush=True)

    # Save to cache.
    if cache:
        safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name)
        safe_name = safe_name[:min(len(safe_name), 128)]
        cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name)
        temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name)
        os.makedirs(cache_dir, exist_ok=True)
        with open(temp_file, "wb") as f:
            f.write(url_data)
        os.replace(temp_file, cache_file) # atomic
        if return_filename:
            return cache_file

    # Return data as file object.
    assert not return_filename
    return io.BytesIO(url_data)


================================================
FILE: ADD/layers/__init__.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from .dino_head import DINOHead
from .mlp import Mlp
from .patch_embed import PatchEmbed
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
from .block import NestedTensorBlock
from .attention import MemEffAttention


================================================
FILE: ADD/layers/attention.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging
import os
import warnings

from torch import Tensor
from torch import nn


logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import memory_efficient_attention, unbind

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (Attention)")
    else:
        warnings.warn("xFormers is disabled (Attention)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False
    warnings.warn("xFormers is not available (Attention)")


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)

        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
        attn = q @ k.transpose(-2, -1)

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class MemEffAttention(Attention):
    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x


================================================
FILE: ADD/layers/block.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

import logging
import os
from typing import Callable, List, Any, Tuple, Dict
import warnings

import torch
from torch import nn, Tensor

from .attention import Attention, MemEffAttention
from .drop_path import DropPath
from .layer_scale import LayerScale
from .mlp import Mlp


logger = logging.getLogger("dinov2")


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import fmha, scaled_index_add, index_select_cat

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (Block)")
    else:
        warnings.warn("xFormers is disabled (Block)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False

    warnings.warn("xFormers is not available (Block)")


class Block(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x


def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    # 1) extract subset using permutation
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    x_subset = x[brange]

    # 2) apply residual_func to get residual
    residual = residual_func(x_subset)

    x_flat = x.flatten(1)
    residual = residual.flatten(1)

    residual_scale_factor = b / sample_subset_size

    # 3) add the residual
    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    return x_plus_residual.view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlock(Block):
    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError


================================================
FILE: ADD/layers/dino_head.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn
from torch.nn.init import trunc_normal_
from torch.nn.utils import weight_norm


class DINOHead(nn.Module):
    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
        self.apply(self._init_weights)
        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.mlp(x)
        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
        x = self.last_layer(x)
        return x


def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
    if nlayers == 1:
        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
    else:
        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
        if use_bn:
            layers.append(nn.BatchNorm1d(hidden_dim))
        layers.append(nn.GELU())
        for _ in range(nlayers - 2):
            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
            if use_bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.GELU())
        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
        return nn.Sequential(*layers)


================================================
FILE: ADD/layers/drop_path.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py


from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


================================================
FILE: ADD/layers/layer_scale.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110

from typing import Union

import torch
from torch import Tensor
from torch import nn


class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


================================================
FILE: ADD/layers/mlp.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py


from typing import Callable, Optional

from torch import Tensor, nn


class Mlp(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


================================================
FILE: ADD/layers/patch_embed.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

from typing import Callable, Optional, Tuple, Union

from torch import Tensor
import torch.nn as nn


def make_2tuple(x):
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        #self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    #def flops(self) -> float:
        #Ho, Wo = self.patches_resolution
        #flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        #if self.norm is not None:
         #   flops += Ho * Wo * self.embed_dim
        #return flops


================================================
FILE: ADD/layers/swiglu_ffn.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

import os
from typing import Callable, Optional
import warnings

from torch import Tensor, nn
import torch.nn.functional as F


class SwiGLUFFN(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import SwiGLU

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (SwiGLU)")
    else:
        warnings.warn("xFormers is disabled (SwiGLU)")
        raise ImportError
except ImportError:
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False

    warnings.warn("xFormers is not available (SwiGLU)")


class SwiGLUFFNFused(SwiGLU):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )


================================================
FILE: ADD/models/discriminator.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""
Projected discriminator architecture from
"StyleGAN-T: Unlocking the Power of GANs for Fast Large-Scale Text-to-Image Synthesis".
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.spectral_norm import SpectralNorm
from torchvision.transforms import RandomCrop, Normalize
import timm
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

from ADD.th_utils import misc
from models.shared import ResidualBlock, FullyConnectedLayer
from models.vit_utils import make_vit_backbone, forward_vit, make_sd_backbone
from models.DiffAugment import DiffAugment
from ADD.utils.util_net import reload_model_

from functools import partial

class SpectralConv1d(nn.Conv1d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        SpectralNorm.apply(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12)


class BatchNormLocal(nn.Module):
    def __init__(self, num_features: int, affine: bool = True, virtual_bs: int = 3, eps: float = 1e-5):
        super().__init__()
        self.virtual_bs = virtual_bs
        self.eps = eps
        self.affine = affine

        if self.affine:
            self.weight = nn.Parameter(torch.ones(num_features))
            self.bias = nn.Parameter(torch.zeros(num_features))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shape = x.size()

        # Reshape batch into groups.
        G = np.ceil(x.size(0)/self.virtual_bs).astype(int)
        x = x.view(G, -1, x.size(-2), x.size(-1))

        # Calculate stats.
        mean = x.mean([1, 3], keepdim=True)
        var = x.var([1, 3], keepdim=True, unbiased=False)
        x = (x - mean) / (torch.sqrt(var + self.eps))

        if self.affine:
            x = x * self.weight[None, :, None] + self.bias[None, :, None]

        return x.view(shape)


def make_block(channels: int, kernel_size: int) -> nn.Module:
    return nn.Sequential(
        SpectralConv1d(
            channels,
            channels,
            kernel_size = kernel_size,
            padding = kernel_size//2,
            padding_mode = 'circular',
        ),
        #BatchNormLocal(channels),
        nn.GroupNorm(4, channels),
        nn.LeakyReLU(0.2, True),
    )

class DiscHead(nn.Module):
    def __init__(self, channels: int, c_dim: int, cmap_dim: int = 64):
        super().__init__()
        self.channels = channels
        self.c_dim = c_dim
        self.cmap_dim = cmap_dim

        self.main = nn.Sequential(
            make_block(channels, kernel_size=1),
            ResidualBlock(make_block(channels, kernel_size=9))
        )

        if self.c_dim > 0:
            self.cmapper = FullyConnectedLayer(self.c_dim, cmap_dim)
            self.cls = SpectralConv1d(channels, cmap_dim, kernel_size=1, padding=0)
        else:
            self.cls = SpectralConv1d(channels, 1, kernel_size=1, padding=0)

    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
        h = self.main(x)
        out = self.cls(h)

        if self.c_dim > 0:
            cmap = self.cmapper(c).unsqueeze(-1)
            out = (out * cmap).sum(1, keepdim=True) * (1 / np.sqrt(self.cmap_dim))

        return out

class DINO(torch.nn.Module):
    def __init__(self, hooks: list[int] = [2,5,8,11], hook_patch: bool = True):
        super().__init__()
        self.n_hooks = len(hooks) + int(hook_patch)

        self.model = make_vit_backbone(
            timm.create_model('vit_small_patch16_224.dino', pretrained=False),
            patch_size=[16,16], hooks=hooks, hook_patch=hook_patch,
        )
        reload_model_(self.model, torch.load('preset/models/dino/dino_deitsmall16_pretrain.pth'))
        self.model = self.model.eval().requires_grad_(False)


        self.img_resolution = self.model.model.patch_embed.img_size[0]
        self.embed_dim = self.model.model.embed_dim
        self.norm = Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        ''' input: x in [0, 1]; output: dict of activations '''
        x = F.interpolate(x, self.img_resolution, mode='area')
        x = self.norm(x)
        features = forward_vit(self.model, x)
        return features


class ProjectedDiscriminator(nn.Module):
    def __init__(self, c_dim: int, diffaug: bool = True, p_crop: float = 0.5):
        super().__init__()
        self.c_dim = c_dim
        self.diffaug = diffaug
        self.p_crop = p_crop

        self.dino = DINO()

        heads = []
        for i in range(self.dino.n_hooks):
            heads += [str(i), DiscHead(self.dino.embed_dim, c_dim)],
        self.heads = nn.ModuleDict(heads)

    def train(self, mode: bool = True):
        self.dino = self.dino.train(False)
        self.heads = self.heads.train(mode)
        return self

    def eval(self):
        return self.train(False)

    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
        # Apply augmentation (x in [-1, 1]).
        if self.diffaug:
            x = DiffAugment(x, policy='translation,cutout')

        # Transform to [0, 1].
        x = x.add(1).div(2)

        # Take crops with probablity p_crop if the image is larger.
        if x.size(-1) > self.dino.img_resolution and np.random.random() < self.p_crop:
            x = RandomCrop(self.dino.img_resolution)(x)

        # Forward pass through DINO ViT.
        features = self.dino(x)

        # Apply discriminator heads.
        logits = []
        for k, head in self.heads.items():
            features[k].requires_grad_(True)
            logits.append(head(features[k], c).view(x.size(0), -1))
        #logits = torch.cat(logits, dim=1)

        return logits, features



================================================
FILE: ADD/models/vit.py
================================================
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

from functools import partial
import math
import logging
from typing import Sequence, Tuple, Union, Callable

import torch
import torch.nn as nn
import torch.utils.checkpoint
from torch.nn.init import trunc_normal_

from ADD.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block


logger = logging.getLogger("dinov2")


def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child_module in module.named_children():
        child_name = ".".join((name, child_name)) if name else child_name
        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
    if depth_first and include_root:
        fn(module=module, name=name)
    return module


class BlockChunk(nn.ModuleList):
    def forward(self, x):
        for b in self:
            x = b(x)
        return x


class DinoVisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    def init_weights(self):
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, w, h):
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset

        sqrt_N = math.sqrt(N)
        sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
            scale_factor=(sx, sy),
            mode="bicubic",
            antialias=self.interpolate_antialias,
        )

        assert int(w0) == patch_pos_embed.shape[-2]
        assert int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens_with_masks(self, x, masks=None):
        B, nc, w, h = x.shape
        x = self.patch_embed(x)
        if masks is not None:
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)

        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)

        if self.register_tokens is not None:
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )

        return x

    def forward_features_list(self, x_list, masks_list):
        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
        for blk in self.blocks:
            x = blk(x)

        all_x = x
        output = []
        for x, masks in zip(all_x, masks_list):
            x_norm = self.norm(x)
            output.append(
                {
                    "x_norm_clstoken": x_norm[:, 0],
                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
                    "x_prenorm": x,
                    "masks": masks,
                }
            )
        return output

    def forward_features(self, x, masks=None):
        fea_list = []
        counter = 0
        if isinstance(x, list):
            return self.forward_features_list(x, masks)

        x = self.prepare_tokens_with_masks(x, masks)
        fea_list.append(x[:, self.num_register_tokens + 1 :].permute(0, 2, 1))

        for blk in self.blocks:
            x = blk(x)
            counter += 1
            if counter % 3 == 0:
                fea_list.append(x[:, self.num_register_tokens + 1 :].permute(0, 2, 1))

        x_norm = self.norm(x)
        return fea_list, x_norm[:, 0]

    def _get_intermediate_layers_not_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def _get_intermediate_layers_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x, n)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x, n)
        if norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, 0] for out in outputs]
        outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
        if reshape:
            B, _, w, h = x.shape
            outputs = [
                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]
        if return_class_token:
            return tuple(zip(outputs, class_tokens))
        return tuple(outputs)

    def forward(self, *args, is_training=False, **kwargs):
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        else:
            return ret#self.head(ret["x_norm_clstoken"])


def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)"""
    if isinstance(module, nn.Linear):
        trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)


def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model

def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


# net = vit_small(patch_size=14, img_size=518, block_chunks=0, init_values=1.0)
# prefile = torch.load('../weights/dinov2_vits14_pretrain.pth')
# net.load_state_dict(prefile, True)
# out = net(torch.rand(1, 3, 518, 518))
# print(out.shape)

# net = vit_large(patch_size=14, img_size=526, block_chunks=0, init_values=1.0, num_register_tokens=4)
# prefile = torch.load('../weights/dinov2_vitl14_reg4_pretrain.pth')
# net.load_state_dict(prefile, True)
# out = net(torch.rand(1, 3, 70, 70))
# print(out.shape)


================================================
FILE: ADD/th_utils/__init__.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# empty


================================================
FILE: ADD/th_utils/custom_ops.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import glob
import hashlib
import importlib
import os
import re
import shutil
import uuid

import torch
import torch.utils.cpp_extension
from torch.utils.file_baton import FileBaton

#----------------------------------------------------------------------------
# Global options.

verbosity = 'brief' # Verbosity level: 'none', 'brief', 'full'

#----------------------------------------------------------------------------
# Internal helper funcs.

def _find_compiler_bindir():
    patterns = [
        'C:/Program Files*/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64',
        'C:/Program Files*/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64',
        'C:/Program Files*/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64',
        'C:/Program Files*/Microsoft Visual Studio */vc/bin',
    ]
    for pattern in patterns:
        matches = sorted(glob.glob(pattern))
        if len(matches):
            return matches[-1]
    return None

#----------------------------------------------------------------------------

def _get_mangled_gpu_name():
    name = torch.cuda.get_device_name().lower()
    out = []
    for c in name:
        if re.match('[a-z0-9_-]+', c):
            out.append(c)
        else:
            out.append('-')
    return ''.join(out)

#----------------------------------------------------------------------------
# Main entry point for compiling and loading C++/CUDA plugins.

_cached_plugins = dict()

def get_plugin(module_name, sources, headers=None, source_dir=None, **build_kwargs):
    assert verbosity in ['none', 'brief', 'full']
    if headers is None:
        headers = []
    if source_dir is not None:
        sources = [os.path.join(source_dir, fname) for fname in sources]
        headers = [os.path.join(source_dir, fname) for fname in headers]

    # Already cached?
    if module_name in _cached_plugins:
        return _cached_plugins[module_name]

    # Print status.
    if verbosity == 'full':
        print(f'Setting up PyTorch plugin "{module_name}"...')
    elif verbosity == 'brief':
        print(f'Setting up PyTorch plugin "{module_name}"... ', end='', flush=True)
    verbose_build = (verbosity == 'full')

    # Compile and load.
    try: # pylint: disable=too-many-nested-blocks
        # Make sure we can find the necessary compiler binaries.
        if os.name == 'nt' and os.system("where cl.exe >nul 2>nul") != 0:
            compiler_bindir = _find_compiler_bindir()
            if compiler_bindir is None:
                raise RuntimeError(f'Could not find MSVC/GCC/CLANG installation on this computer. Check _find_compiler_bindir() in "{__file__}".')
            os.environ['PATH'] += ';' + compiler_bindir

        # Some containers set TORCH_CUDA_ARCH_LIST to a list that can either
        # break the build or unnecessarily restrict what's available to nvcc.
        # Unset it to let nvcc decide based on what's available on the
        # machine.
        os.environ['TORCH_CUDA_ARCH_LIST'] = ''

        # Incremental build md5sum trickery.  Copies all the input source files
        # into a cached build directory under a combined md5 digest of the input
        # source files.  Copying is done only if the combined digest has changed.
        # This keeps input file timestamps and filenames the same as in previous
        # extension builds, allowing for fast incremental rebuilds.
        #
        # This optimization is done only in case all the source files reside in
        # a single directory (just for simplicity) and if the TORCH_EXTENSIONS_DIR
        # environment variable is set (we take this as a signal that the user
        # actually cares about this.)
        #
        # EDIT: We now do it regardless of TORCH_EXTENSIOS_DIR, in order to work
        # around the *.cu dependency bug in ninja config.
        #
        all_source_files = sorted(sources + headers)
        all_source_dirs = set(os.path.dirname(fname) for fname in all_source_files)
        if len(all_source_dirs) == 1: # and ('TORCH_EXTENSIONS_DIR' in os.environ):

            # Compute combined hash digest for all source files.
            hash_md5 = hashlib.md5()
            for src in all_source_files:
                with open(src, 'rb') as f:
                    hash_md5.update(f.read())

            # Select cached build directory name.
            source_digest = hash_md5.hexdigest()
            build_top_dir = torch.utils.cpp_extension._get_build_directory(module_name, verbose=verbose_build) # pylint: disable=protected-access
            cached_build_dir = os.path.join(build_top_dir, f'{source_digest}-{_get_mangled_gpu_name()}')

            if not os.path.isdir(cached_build_dir):
                tmpdir = f'{build_top_dir}/srctmp-{uuid.uuid4().hex}'
                os.makedirs(tmpdir)
                for src in all_source_files:
                    shutil.copyfile(src, os.path.join(tmpdir, os.path.basename(src)))
                try:
                    os.replace(tmpdir, cached_build_dir) # atomic
                except OSError:
                    # source directory already exists, delete tmpdir and its contents.
                    shutil.rmtree(tmpdir)
                    if not os.path.isdir(cached_build_dir): raise

            # Compile.
            cached_sources = [os.path.join(cached_build_dir, os.path.basename(fname)) for fname in sources]
            torch.utils.cpp_extension.load(name=module_name, build_directory=cached_build_dir,
                verbose=verbose_build, sources=cached_sources, **build_kwargs)
        else:
            torch.utils.cpp_extension.load(name=module_name, verbose=verbose_build, sources=sources, **build_kwargs)

        # Load.
        module = importlib.import_module(module_name)

    except:
        if verbosity == 'brief':
            print('Failed!')
        raise

    # Print status and add to cache dict.
    if verbosity == 'full':
        print(f'Done setting up PyTorch plugin "{module_name}".')
    elif verbosity == 'brief':
        print('Done.')
    _cached_plugins[module_name] = module
    return module

#----------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/misc.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import re
import contextlib
import numpy as np
import torch
import warnings
import ADD.dnnlib as dnnlib

#----------------------------------------------------------------------------
# Cached construction of constant tensors. Avoids CPU=>GPU copy when the
# same constant is used multiple times.

_constant_cache = dict()

def constant(value, shape=None, dtype=None, device=None, memory_format=None):
    value = np.asarray(value)
    if shape is not None:
        shape = tuple(shape)
    if dtype is None:
        dtype = torch.get_default_dtype()
    if device is None:
        device = torch.device('cpu')
    if memory_format is None:
        memory_format = torch.contiguous_format

    key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
    tensor = _constant_cache.get(key, None)
    if tensor is None:
        tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
        if shape is not None:
            tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape))
        tensor = tensor.contiguous(memory_format=memory_format)
        _constant_cache[key] = tensor
    return tensor

#----------------------------------------------------------------------------
# Replace NaN/Inf with specified numerical values.

try:
    nan_to_num = torch.nan_to_num # 1.8.0a0
except AttributeError:
    def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): # pylint: disable=redefined-builtin
        assert isinstance(input, torch.Tensor)
        if posinf is None:
            posinf = torch.finfo(input.dtype).max
        if neginf is None:
            neginf = torch.finfo(input.dtype).min
        assert nan == 0
        return torch.clamp(input.unsqueeze(0).nansum(0), min=neginf, max=posinf, out=out)

#----------------------------------------------------------------------------
# Symbolic assert.

try:
    symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access
except AttributeError:
    symbolic_assert = torch.Assert # 1.7.0

#----------------------------------------------------------------------------
# Context manager to temporarily suppress known warnings in torch.jit.trace().
# Note: Cannot use catch_warnings because of https://bugs.python.org/issue29672

@contextlib.contextmanager
def suppress_tracer_warnings():
    flt = ('ignore', None, torch.jit.TracerWarning, None, 0)
    warnings.filters.insert(0, flt)
    yield
    warnings.filters.remove(flt)

#----------------------------------------------------------------------------
# Assert that the shape of a tensor matches the given list of integers.
# None indicates that the size of a dimension is allowed to vary.
# Performs symbolic assertion when used in torch.jit.trace().

def assert_shape(tensor, ref_shape):
    if tensor.ndim != len(ref_shape):
        raise AssertionError(f'Wrong number of dimensions: got {tensor.ndim}, expected {len(ref_shape)}')
    for idx, (size, ref_size) in enumerate(zip(tensor.shape, ref_shape)):
        if ref_size is None:
            pass
        elif isinstance(ref_size, torch.Tensor):
            with suppress_tracer_warnings(): # as_tensor results are registered as constants
                symbolic_assert(torch.equal(torch.as_tensor(size), ref_size), f'Wrong size for dimension {idx}')
        elif isinstance(size, torch.Tensor):
            with suppress_tracer_warnings(): # as_tensor results are registered as constants
                symbolic_assert(torch.equal(size, torch.as_tensor(ref_size)), f'Wrong size for dimension {idx}: expected {ref_size}')
        elif size != ref_size:
            raise AssertionError(f'Wrong size for dimension {idx}: got {size}, expected {ref_size}')

#----------------------------------------------------------------------------
# Function decorator that calls torch.autograd.profiler.record_function().

def profiled_function(fn):
    def decorator(*args, **kwargs):
        with torch.autograd.profiler.record_function(fn.__name__):
            return fn(*args, **kwargs)
    decorator.__name__ = fn.__name__
    return decorator

#----------------------------------------------------------------------------
# Sampler for torch.utils.data.DataLoader that loops over the dataset
# indefinitely, shuffling items as it goes.

class InfiniteSampler(torch.utils.data.Sampler):
    def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, window_size=0.5):
        assert len(dataset) > 0
        assert num_replicas > 0
        assert 0 <= rank < num_replicas
        assert 0 <= window_size <= 1
        super().__init__(dataset)
        self.dataset = dataset
        self.rank = rank
        self.num_replicas = num_replicas
        self.shuffle = shuffle
        self.seed = seed
        self.window_size = window_size

    def __iter__(self):
        order = np.arange(len(self.dataset))
        rnd = None
        window = 0
        if self.shuffle:
            rnd = np.random.RandomState(self.seed)
            rnd.shuffle(order)
            window = int(np.rint(order.size * self.window_size))

        idx = 0
        while True:
            i = idx % order.size
            if idx % self.num_replicas == self.rank:
                yield order[i]
            if window >= 2:
                j = (i - rnd.randint(window)) % order.size
                order[i], order[j] = order[j], order[i]
            idx += 1

#----------------------------------------------------------------------------
# Utilities for operating with torch.nn.Module parameters and buffers.
def spectral_to_cpu(model: torch.nn.Module):
    def wrapped_in_spectral(m): return hasattr(m, 'weight_v')
    children = get_children(model)
    for child in children:
        if wrapped_in_spectral(child):
            child.weight = child.weight.cpu()
    return model

def get_children(model: torch.nn.Module):
    children = list(model.children())
    flatt_children = []
    if children == []:
        return model
    else:
       for child in children:
            try:
                flatt_children.extend(get_children(child))
            except TypeError:
                flatt_children.append(get_children(child))
    return flatt_children

def params_and_buffers(module):
    assert isinstance(module, torch.nn.Module)
    return list(module.parameters()) + list(module.buffers())

def named_params_and_buffers(module):
    assert isinstance(module, torch.nn.Module)
    return list(module.named_parameters()) + list(module.named_buffers())

def copy_params_and_buffers(src_module, dst_module, require_all=False):
    assert isinstance(src_module, torch.nn.Module)
    assert isinstance(dst_module, torch.nn.Module)
    src_tensors = dict(named_params_and_buffers(src_module))
    for name, tensor in named_params_and_buffers(dst_module):
        assert (name in src_tensors) or (not require_all)
        if name in src_tensors:
            tensor.copy_(src_tensors[name].detach()).requires_grad_(tensor.requires_grad)

#----------------------------------------------------------------------------
# Context manager for easily enabling/disabling DistributedDataParallel
# synchronization.

@contextlib.contextmanager
def ddp_sync(module, sync):
    assert isinstance(module, torch.nn.Module)
    if sync or not isinstance(module, torch.nn.parallel.DistributedDataParallel):
        yield
    else:
        with module.no_sync():
            yield

#----------------------------------------------------------------------------
# Check DistributedDataParallel consistency across processes.

def check_ddp_consistency(module, ignore_regex=None):
    assert isinstance(module, torch.nn.Module)
    for name, tensor in named_params_and_buffers(module):
        fullname = type(module).__name__ + '.' + name
        if ignore_regex is not None and re.fullmatch(ignore_regex, fullname):
            continue
        tensor = tensor.detach()
        if tensor.is_floating_point():
            tensor = nan_to_num(tensor)
        other = tensor.clone()
        torch.distributed.broadcast(tensor=other, src=0)
        assert (tensor == other).all(), fullname

#----------------------------------------------------------------------------
# Print summary table of module hierarchy.

def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True):
    assert isinstance(module, torch.nn.Module)
    assert not isinstance(module, torch.jit.ScriptModule)
    assert isinstance(inputs, (tuple, list))

    # Register hooks.
    entries = []
    nesting = [0]
    def pre_hook(_mod, _inputs):
        nesting[0] += 1
    def post_hook(mod, _inputs, outputs):
        nesting[0] -= 1
        if nesting[0] <= max_nesting:
            outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs]
            outputs = [t for t in outputs if isinstance(t, torch.Tensor)]
            entries.append(dnnlib.EasyDict(mod=mod, outputs=outputs))
    hooks = [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()]
    hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()]

    # Run module.
    outputs = module(*inputs)
    for hook in hooks:
        hook.remove()

    # Identify unique outputs, parameters, and buffers.
    tensors_seen = set()
    for e in entries:
        e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen]
        e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen]
        e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen]
        tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs}

    # Filter out redundant entries.
    if skip_redundant:
        entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)]

    # Construct table.
    rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Output shape', 'Datatype']]
    rows += [['---'] * len(rows[0])]
    param_total = 0
    buffer_total = 0
    submodule_names = {mod: name for name, mod in module.named_modules()}
    for e in entries:
        name = '<top-level>' if e.mod is module else submodule_names[e.mod]
        param_size = sum(t.numel() for t in e.unique_params)
        buffer_size = sum(t.numel() for t in e.unique_buffers)
        output_shapes = [str(list(t.shape)) for t in e.outputs]
        output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs]
        rows += [[
            name + (':0' if len(e.outputs) >= 2 else ''),
            str(param_size) if param_size else '-',
            str(buffer_size) if buffer_size else '-',
            (output_shapes + ['-'])[0],
            (output_dtypes + ['-'])[0],
        ]]
        for idx in range(1, len(e.outputs)):
            rows += [[name + f':{idx}', '-', '-', output_shapes[idx], output_dtypes[idx]]]
        param_total += param_size
        buffer_total += buffer_size
    rows += [['---'] * len(rows[0])]
    rows += [['Total', str(param_total), str(buffer_total), '-', '-']]

    # Print table.
    widths = [max(len(cell) for cell in column) for column in zip(*rows)]
    print()
    for row in rows:
        print('  '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths)))
    print()
    return outputs


================================================
FILE: ADD/th_utils/ops/__init__.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

# empty


================================================
FILE: ADD/th_utils/ops/bias_act.cpp
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "bias_act.h"

//------------------------------------------------------------------------

static bool has_same_layout(torch::Tensor x, torch::Tensor y)
{
    if (x.dim() != y.dim())
        return false;
    for (int64_t i = 0; i < x.dim(); i++)
    {
        if (x.size(i) != y.size(i))
            return false;
        if (x.size(i) >= 2 && x.stride(i) != y.stride(i))
            return false;
    }
    return true;
}

//------------------------------------------------------------------------

static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::Tensor xref, torch::Tensor yref, torch::Tensor dy, int grad, int dim, int act, float alpha, float gain, float clamp)
{
    // Validate arguments.
    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
    TORCH_CHECK(b.numel() == 0 || (b.dtype() == x.dtype() && b.device() == x.device()), "b must have the same dtype and device as x");
    TORCH_CHECK(xref.numel() == 0 || (xref.sizes() == x.sizes() && xref.dtype() == x.dtype() && xref.device() == x.device()), "xref must have the same shape, dtype, and device as x");
    TORCH_CHECK(yref.numel() == 0 || (yref.sizes() == x.sizes() && yref.dtype() == x.dtype() && yref.device() == x.device()), "yref must have the same shape, dtype, and device as x");
    TORCH_CHECK(dy.numel() == 0 || (dy.sizes() == x.sizes() && dy.dtype() == x.dtype() && dy.device() == x.device()), "dy must have the same dtype and device as x");
    TORCH_CHECK(x.numel() <= INT_MAX, "x is too large");
    TORCH_CHECK(b.dim() == 1, "b must have rank 1");
    TORCH_CHECK(b.numel() == 0 || (dim >= 0 && dim < x.dim()), "dim is out of bounds");
    TORCH_CHECK(b.numel() == 0 || b.numel() == x.size(dim), "b has wrong number of elements");
    TORCH_CHECK(grad >= 0, "grad must be non-negative");

    // Validate layout.
    TORCH_CHECK(x.is_non_overlapping_and_dense(), "x must be non-overlapping and dense");
    TORCH_CHECK(b.is_contiguous(), "b must be contiguous");
    TORCH_CHECK(xref.numel() == 0 || has_same_layout(xref, x), "xref must have the same layout as x");
    TORCH_CHECK(yref.numel() == 0 || has_same_layout(yref, x), "yref must have the same layout as x");
    TORCH_CHECK(dy.numel() == 0 || has_same_layout(dy, x), "dy must have the same layout as x");

    // Create output tensor.
    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
    torch::Tensor y = torch::empty_like(x);
    TORCH_CHECK(has_same_layout(y, x), "y must have the same layout as x");

    // Initialize CUDA kernel parameters.
    bias_act_kernel_params p;
    p.x     = x.data_ptr();
    p.b     = (b.numel()) ? b.data_ptr() : NULL;
    p.xref  = (xref.numel()) ? xref.data_ptr() : NULL;
    p.yref  = (yref.numel()) ? yref.data_ptr() : NULL;
    p.dy    = (dy.numel()) ? dy.data_ptr() : NULL;
    p.y     = y.data_ptr();
    p.grad  = grad;
    p.act   = act;
    p.alpha = alpha;
    p.gain  = gain;
    p.clamp = clamp;
    p.sizeX = (int)x.numel();
    p.sizeB = (int)b.numel();
    p.stepB = (b.numel()) ? (int)x.stride(dim) : 1;

    // Choose CUDA kernel.
    void* kernel;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&]
    {
        kernel = choose_bias_act_kernel<scalar_t>(p);
    });
    TORCH_CHECK(kernel, "no CUDA kernel found for the specified activation func");

    // Launch CUDA kernel.
    p.loopX = 4;
    int blockSize = 4 * 32;
    int gridSize = (p.sizeX - 1) / (p.loopX * blockSize) + 1;
    void* args[] = {&p};
    AT_CUDA_CHECK(cudaLaunchKernel(kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
    return y;
}

//------------------------------------------------------------------------

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("bias_act", &bias_act);
}

//------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/bias_act.cu
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <c10/util/Half.h>
#include "bias_act.h"

//------------------------------------------------------------------------
// Helpers.

template <class T> struct InternalType;
template <> struct InternalType<double>     { typedef double scalar_t; };
template <> struct InternalType<float>      { typedef float  scalar_t; };
template <> struct InternalType<c10::Half>  { typedef float  scalar_t; };

//------------------------------------------------------------------------
// CUDA kernel.

template <class T, int A>
__global__ void bias_act_kernel(bias_act_kernel_params p)
{
    typedef typename InternalType<T>::scalar_t scalar_t;
    int G                 = p.grad;
    scalar_t alpha        = (scalar_t)p.alpha;
    scalar_t gain         = (scalar_t)p.gain;
    scalar_t clamp        = (scalar_t)p.clamp;
    scalar_t one          = (scalar_t)1;
    scalar_t two          = (scalar_t)2;
    scalar_t expRange     = (scalar_t)80;
    scalar_t halfExpRange = (scalar_t)40;
    scalar_t seluScale    = (scalar_t)1.0507009873554804934193349852946;
    scalar_t seluAlpha    = (scalar_t)1.6732632423543772848170429916717;

    // Loop over elements.
    int xi = blockIdx.x * p.loopX * blockDim.x + threadIdx.x;
    for (int loopIdx = 0; loopIdx < p.loopX && xi < p.sizeX; loopIdx++, xi += blockDim.x)
    {
        // Load.
        scalar_t x = (scalar_t)((const T*)p.x)[xi];
        scalar_t b = (p.b) ? (scalar_t)((const T*)p.b)[(xi / p.stepB) % p.sizeB] : 0;
        scalar_t xref = (p.xref) ? (scalar_t)((const T*)p.xref)[xi] : 0;
        scalar_t yref = (p.yref) ? (scalar_t)((const T*)p.yref)[xi] : 0;
        scalar_t dy = (p.dy) ? (scalar_t)((const T*)p.dy)[xi] : one;
        scalar_t yy = (gain != 0) ? yref / gain : 0;
        scalar_t y = 0;

        // Apply bias.
        ((G == 0) ? x : xref) += b;

        // linear
        if (A == 1)
        {
            if (G == 0) y = x;
            if (G == 1) y = x;
        }

        // relu
        if (A == 2)
        {
            if (G == 0) y = (x > 0) ? x : 0;
            if (G == 1) y = (yy > 0) ? x : 0;
        }

        // lrelu
        if (A == 3)
        {
            if (G == 0) y = (x > 0) ? x : x * alpha;
            if (G == 1) y = (yy > 0) ? x : x * alpha;
        }

        // tanh
        if (A == 4)
        {
            if (G == 0) { scalar_t c = exp(x); scalar_t d = one / c; y = (x < -expRange) ? -one : (x > expRange) ? one : (c - d) / (c + d); }
            if (G == 1) y = x * (one - yy * yy);
            if (G == 2) y = x * (one - yy * yy) * (-two * yy);
        }

        // sigmoid
        if (A == 5)
        {
            if (G == 0) y = (x < -expRange) ? 0 : one / (exp(-x) + one);
            if (G == 1) y = x * yy * (one - yy);
            if (G == 2) y = x * yy * (one - yy) * (one - two * yy);
        }

        // elu
        if (A == 6)
        {
            if (G == 0) y = (x >= 0) ? x : exp(x) - one;
            if (G == 1) y = (yy >= 0) ? x : x * (yy + one);
            if (G == 2) y = (yy >= 0) ? 0 : x * (yy + one);
        }

        // selu
        if (A == 7)
        {
            if (G == 0) y = (x >= 0) ? seluScale * x : (seluScale * seluAlpha) * (exp(x) - one);
            if (G == 1) y = (yy >= 0) ? x * seluScale : x * (yy + seluScale * seluAlpha);
            if (G == 2) y = (yy >= 0) ? 0 : x * (yy + seluScale * seluAlpha);
        }

        // softplus
        if (A == 8)
        {
            if (G == 0) y = (x > expRange) ? x : log(exp(x) + one);
            if (G == 1) y = x * (one - exp(-yy));
            if (G == 2) { scalar_t c = exp(-yy); y = x * c * (one - c); }
        }

        // swish
        if (A == 9)
        {
            if (G == 0)
                y = (x < -expRange) ? 0 : x / (exp(-x) + one);
            else
            {
                scalar_t c = exp(xref);
                scalar_t d = c + one;
                if (G == 1)
                    y = (xref > halfExpRange) ? x : x * c * (xref + d) / (d * d);
                else
                    y = (xref > halfExpRange) ? 0 : x * c * (xref * (two - d) + two * d) / (d * d * d);
                yref = (xref < -expRange) ? 0 : xref / (exp(-xref) + one) * gain;
            }
        }

        // Apply gain.
        y *= gain * dy;

        // Clamp.
        if (clamp >= 0)
        {
            if (G == 0)
                y = (y > -clamp & y < clamp) ? y : (y >= 0) ? clamp : -clamp;
            else
                y = (yref > -clamp & yref < clamp) ? y : 0;
        }

        // Store.
        ((T*)p.y)[xi] = (T)y;
    }
}

//------------------------------------------------------------------------
// CUDA kernel selection.

template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p)
{
    if (p.act == 1) return (void*)bias_act_kernel<T, 1>;
    if (p.act == 2) return (void*)bias_act_kernel<T, 2>;
    if (p.act == 3) return (void*)bias_act_kernel<T, 3>;
    if (p.act == 4) return (void*)bias_act_kernel<T, 4>;
    if (p.act == 5) return (void*)bias_act_kernel<T, 5>;
    if (p.act == 6) return (void*)bias_act_kernel<T, 6>;
    if (p.act == 7) return (void*)bias_act_kernel<T, 7>;
    if (p.act == 8) return (void*)bias_act_kernel<T, 8>;
    if (p.act == 9) return (void*)bias_act_kernel<T, 9>;
    return NULL;
}

//------------------------------------------------------------------------
// Template specializations.

template void* choose_bias_act_kernel<double>       (const bias_act_kernel_params& p);
template void* choose_bias_act_kernel<float>        (const bias_act_kernel_params& p);
template void* choose_bias_act_kernel<c10::Half>    (const bias_act_kernel_params& p);

//------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/bias_act.h
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

//------------------------------------------------------------------------
// CUDA kernel parameters.

struct bias_act_kernel_params
{
    const void* x;      // [sizeX]
    const void* b;      // [sizeB] or NULL
    const void* xref;   // [sizeX] or NULL
    const void* yref;   // [sizeX] or NULL
    const void* dy;     // [sizeX] or NULL
    void*       y;      // [sizeX]

    int         grad;
    int         act;
    float       alpha;
    float       gain;
    float       clamp;

    int         sizeX;
    int         sizeB;
    int         stepB;
    int         loopX;
};

//------------------------------------------------------------------------
// CUDA kernel selection.

template <class T> void* choose_bias_act_kernel(const bias_act_kernel_params& p);

//------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/bias_act.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Custom PyTorch ops for efficient bias and activation."""

import os
import numpy as np
import torch
import ADD.dnnlib as dnnlib

from .. import custom_ops
from .. import misc

#----------------------------------------------------------------------------

activation_funcs = {
    'linear':   dnnlib.EasyDict(func=lambda x, **_:         x,                                          def_alpha=0,    def_gain=1,             cuda_idx=1, ref='',  has_2nd_grad=False),
    'relu':     dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.relu(x),                def_alpha=0,    def_gain=np.sqrt(2),    cuda_idx=2, ref='y', has_2nd_grad=False),
    'lrelu':    dnnlib.EasyDict(func=lambda x, alpha, **_:  torch.nn.functional.leaky_relu(x, alpha),   def_alpha=0.2,  def_gain=np.sqrt(2),    cuda_idx=3, ref='y', has_2nd_grad=False),
    'tanh':     dnnlib.EasyDict(func=lambda x, **_:         torch.tanh(x),                              def_alpha=0,    def_gain=1,             cuda_idx=4, ref='y', has_2nd_grad=True),
    'sigmoid':  dnnlib.EasyDict(func=lambda x, **_:         torch.sigmoid(x),                           def_alpha=0,    def_gain=1,             cuda_idx=5, ref='y', has_2nd_grad=True),
    'elu':      dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.elu(x),                 def_alpha=0,    def_gain=1,             cuda_idx=6, ref='y', has_2nd_grad=True),
    'selu':     dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.selu(x),                def_alpha=0,    def_gain=1,             cuda_idx=7, ref='y', has_2nd_grad=True),
    'softplus': dnnlib.EasyDict(func=lambda x, **_:         torch.nn.functional.softplus(x),            def_alpha=0,    def_gain=1,             cuda_idx=8, ref='y', has_2nd_grad=True),
    'swish':    dnnlib.EasyDict(func=lambda x, **_:         torch.sigmoid(x) * x,                       def_alpha=0,    def_gain=np.sqrt(2),    cuda_idx=9, ref='x', has_2nd_grad=True),
}

#----------------------------------------------------------------------------

_plugin = None
_null_tensor = torch.empty([0])

def _init():
    global _plugin
    if _plugin is None:
        _plugin = custom_ops.get_plugin(
            module_name='bias_act_plugin',
            sources=['bias_act.cpp', 'bias_act.cu'],
            headers=['bias_act.h'],
            source_dir=os.path.dirname(__file__),
            extra_cuda_cflags=['--use_fast_math', '--allow-unsupported-compiler'],
        )
    return True

#----------------------------------------------------------------------------

def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None, impl='cuda'):
    r"""Fused bias and activation function.

    Adds bias `b` to activation tensor `x`, evaluates activation function `act`,
    and scales the result by `gain`. Each of the steps is optional. In most cases,
    the fused op is considerably more efficient than performing the same calculation
    using standard PyTorch ops. It supports first and second order gradients,
    but not third order gradients.

    Args:
        x:      Input activation tensor. Can be of any shape.
        b:      Bias vector, or `None` to disable. Must be a 1D tensor of the same type
                as `x`. The shape must be known, and it must match the dimension of `x`
                corresponding to `dim`.
        dim:    The dimension in `x` corresponding to the elements of `b`.
                The value of `dim` is ignored if `b` is not specified.
        act:    Name of the activation function to evaluate, or `"linear"` to disable.
                Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc.
                See `activation_funcs` for a full list. `None` is not allowed.
        alpha:  Shape parameter for the activation function, or `None` to use the default.
        gain:   Scaling factor for the output tensor, or `None` to use default.
                See `activation_funcs` for the default scaling of each activation function.
                If unsure, consider specifying 1.
        clamp:  Clamp the output values to `[-clamp, +clamp]`, or `None` to disable
                the clamping (default).
        impl:   Name of the implementation to use. Can be `"ref"` or `"cuda"` (default).

    Returns:
        Tensor of the same shape and datatype as `x`.
    """
    assert isinstance(x, torch.Tensor)
    assert impl in ['ref', 'cuda']
    if impl == 'cuda' and x.device.type == 'cuda' and _init():
        return _bias_act_cuda(dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(x, b)
    return _bias_act_ref(x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp)

#----------------------------------------------------------------------------

@misc.profiled_function
def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None, clamp=None):
    """Slow reference implementation of `bias_act()` using standard TensorFlow ops.
    """
    assert isinstance(x, torch.Tensor)
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    clamp = float(clamp if clamp is not None else -1)

    # Add bias.
    if b is not None:
        assert isinstance(b, torch.Tensor) and b.ndim == 1
        assert 0 <= dim < x.ndim
        assert b.shape[0] == x.shape[dim]
        x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)])

    # Evaluate activation function.
    alpha = float(alpha)
    x = spec.func(x, alpha=alpha)

    # Scale by gain.
    gain = float(gain)
    if gain != 1:
        x = x * gain

    # Clamp.
    if clamp >= 0:
        x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type
    return x

#----------------------------------------------------------------------------

_bias_act_cuda_cache = dict()

def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None):
    """Fast CUDA implementation of `bias_act()` using custom ops.
    """
    # Parse arguments.
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    clamp = float(clamp if clamp is not None else -1)

    # Lookup from cache.
    key = (dim, act, alpha, gain, clamp)
    if key in _bias_act_cuda_cache:
        return _bias_act_cuda_cache[key]

    # Forward op.
    class BiasActCuda(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, b): # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(1) == 1 else torch.contiguous_format
            x = x.contiguous(memory_format=ctx.memory_format)
            b = b.contiguous() if b is not None else _null_tensor
            y = x
            if act != 'linear' or gain != 1 or clamp >= 0 or b is not _null_tensor:
                y = _plugin.bias_act(x, b, _null_tensor, _null_tensor, _null_tensor, 0, dim, spec.cuda_idx, alpha, gain, clamp)
            ctx.save_for_backward(
                x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
                b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor,
                y if 'y' in spec.ref else _null_tensor)
            return y

        @staticmethod
        def backward(ctx, dy): # pylint: disable=arguments-differ
            dy = dy.contiguous(memory_format=ctx.memory_format)
            x, b, y = ctx.saved_tensors
            dx = None
            db = None

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
                dx = dy
                if act != 'linear' or gain != 1 or clamp >= 0:
                    dx = BiasActCudaGrad.apply(dy, x, b, y)

            if ctx.needs_input_grad[1]:
                db = dx.sum([i for i in range(dx.ndim) if i != dim])

            return dx, db

    # Backward op.
    class BiasActCudaGrad(torch.autograd.Function):
        @staticmethod
        def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if dy.ndim > 2 and dy.stride(1) == 1 else torch.contiguous_format
            dx = _plugin.bias_act(dy, b, x, y, _null_tensor, 1, dim, spec.cuda_idx, alpha, gain, clamp)
            ctx.save_for_backward(
                dy if spec.has_2nd_grad else _null_tensor,
                x, b, y)
            return dx

        @staticmethod
        def backward(ctx, d_dx): # pylint: disable=arguments-differ
            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
            dy, x, b, y = ctx.saved_tensors
            d_dy = None
            d_x = None
            d_b = None
            d_y = None

            if ctx.needs_input_grad[0]:
                d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)

            if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]):
                d_x = _plugin.bias_act(d_dx, b, x, y, dy, 2, dim, spec.cuda_idx, alpha, gain, clamp)

            if spec.has_2nd_grad and ctx.needs_input_grad[2]:
                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])

            return d_dy, d_x, d_b, d_y

    # Add to cache.
    _bias_act_cuda_cache[key] = BiasActCuda
    return BiasActCuda

#----------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/conv2d_gradfix.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""Custom replacement for `torch.nn.functional.conv2d` that supports
arbitrarily high order gradients with zero performance penalty."""

import contextlib
import torch
from pkg_resources import parse_version

# pylint: disable=redefined-builtin
# pylint: disable=arguments-differ
# pylint: disable=protected-access

#----------------------------------------------------------------------------

enabled = False                     # Enable the custom op by setting this to true.
weight_gradients_disabled = False   # Forcefully disable computation of gradients with respect to the weights.
_use_pytorch_1_11_api = parse_version(torch.__version__) >= parse_version('1.11.0a') # Allow prerelease builds of 1.11

@contextlib.contextmanager
def no_weight_gradients(disable=True):
    global weight_gradients_disabled
    old = weight_gradients_disabled
    if disable:
        weight_gradients_disabled = True
    yield
    weight_gradients_disabled = old

#----------------------------------------------------------------------------

def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
    if _should_use_custom_op(input):
        return _conv2d_gradfix(transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias)
    return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)

def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, output_padding=0, groups=1, dilation=1):
    if _should_use_custom_op(input):
        return _conv2d_gradfix(transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias)
    return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation)

#----------------------------------------------------------------------------

def _should_use_custom_op(input):
    assert isinstance(input, torch.Tensor)
    if (not enabled) or (not torch.backends.cudnn.enabled):
        return False
    if _use_pytorch_1_11_api:
        # The work-around code doesn't work on PyTorch 1.11.0 onwards
        return False
    if input.device.type != 'cuda':
        return False
    return True

def _tuple_of_ints(xs, ndim):
    xs = tuple(xs) if isinstance(xs, (tuple, list)) else (xs,) * ndim
    assert len(xs) == ndim
    assert all(isinstance(x, int) for x in xs)
    return xs

#----------------------------------------------------------------------------

_conv2d_gradfix_cache = dict()
_null_tensor = torch.empty([0])

def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_padding, dilation, groups):
    # Parse arguments.
    ndim = 2
    weight_shape = tuple(weight_shape)
    stride = _tuple_of_ints(stride, ndim)
    padding = _tuple_of_ints(padding, ndim)
    output_padding = _tuple_of_ints(output_padding, ndim)
    dilation = _tuple_of_ints(dilation, ndim)

    # Lookup from cache.
    key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups)
    if key in _conv2d_gradfix_cache:
        return _conv2d_gradfix_cache[key]

    # Validate arguments.
    assert groups >= 1
    assert len(weight_shape) == ndim + 2
    assert all(stride[i] >= 1 for i in range(ndim))
    assert all(padding[i] >= 0 for i in range(ndim))
    assert all(dilation[i] >= 0 for i in range(ndim))
    if not transpose:
        assert all(output_padding[i] == 0 for i in range(ndim))
    else: # transpose
        assert all(0 <= output_padding[i] < max(stride[i], dilation[i]) for i in range(ndim))

    # Helpers.
    common_kwargs = dict(stride=stride, padding=padding, dilation=dilation, groups=groups)
    def calc_output_padding(input_shape, output_shape):
        if transpose:
            return [0, 0]
        return [
            input_shape[i + 2]
            - (output_shape[i + 2] - 1) * stride[i]
            - (1 - 2 * padding[i])
            - dilation[i] * (weight_shape[i + 2] - 1)
            for i in range(ndim)
        ]

    # Forward & backward.
    class Conv2d(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input, weight, bias):
            assert weight.shape == weight_shape
            ctx.save_for_backward(
                input if weight.requires_grad else _null_tensor,
                weight if input.requires_grad else _null_tensor,
            )
            ctx.input_shape = input.shape

            # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere).
            if weight_shape[2:] == stride == dilation == (1, 1) and padding == (0, 0) and torch.cuda.get_device_capability(input.device) < (8, 0):
                a = weight.reshape(groups, weight_shape[0] // groups, weight_shape[1])
                b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1)
                c = (a.transpose(1, 2) if transpose else a) @ b.permute(1, 2, 0, 3).flatten(2)
                c = c.reshape(-1, input.shape[0], *input.shape[2:]).transpose(0, 1)
                c = c if bias is None else c + bias.unsqueeze(0).unsqueeze(2).unsqueeze(3)
                return c.contiguous(memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format))

            # General case => cuDNN.
            if transpose:
                return torch.nn.functional.conv_transpose2d(input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs)
            return torch.nn.functional.conv2d(input=input, weight=weight, bias=bias, **common_kwargs)

        @staticmethod
        def backward(ctx, grad_output):
            input, weight = ctx.saved_tensors
            input_shape = ctx.input_shape
            grad_input = None
            grad_weight = None
            grad_bias = None

            if ctx.needs_input_grad[0]:
                p = calc_output_padding(input_shape=input_shape, output_shape=grad_output.shape)
                op = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs)
                grad_input = op.apply(grad_output, weight, None)
                assert grad_input.shape == input_shape

            if ctx.needs_input_grad[1] and not weight_gradients_disabled:
                grad_weight = Conv2dGradWeight.apply(grad_output, input)
                assert grad_weight.shape == weight_shape

            if ctx.needs_input_grad[2]:
                grad_bias = grad_output.sum([0, 2, 3])

            return grad_input, grad_weight, grad_bias

    # Gradient with respect to the weights.
    class Conv2dGradWeight(torch.autograd.Function):
        @staticmethod
        def forward(ctx, grad_output, input):
            ctx.save_for_backward(
                grad_output if input.requires_grad else _null_tensor,
                input if grad_output.requires_grad else _null_tensor,
            )
            ctx.grad_output_shape = grad_output.shape
            ctx.input_shape = input.shape

            # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere).
            if weight_shape[2:] == stride == dilation == (1, 1) and padding == (0, 0):
                a = grad_output.reshape(grad_output.shape[0], groups, grad_output.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2)
                b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2)
                c = (b @ a.transpose(1, 2) if transpose else a @ b.transpose(1, 2)).reshape(weight_shape)
                return c.contiguous(memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format))

            # General case => cuDNN.
            name = 'aten::cudnn_convolution_transpose_backward_weight' if transpose else 'aten::cudnn_convolution_backward_weight'
            flags = [torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic, torch.backends.cudnn.allow_tf32]
            return torch._C._jit_get_operation(name)(weight_shape, grad_output, input, padding, stride, dilation, groups, *flags)

        @staticmethod
        def backward(ctx, grad2_grad_weight):
            grad_output, input = ctx.saved_tensors
            grad_output_shape = ctx.grad_output_shape
            input_shape = ctx.input_shape
            grad2_grad_output = None
            grad2_input = None

            if ctx.needs_input_grad[0]:
                grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None)
                assert grad2_grad_output.shape == grad_output_shape

            if ctx.needs_input_grad[1]:
                p = calc_output_padding(input_shape=input_shape, output_shape=grad_output_shape)
                op = _conv2d_gradfix(transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs)
                grad2_input = op.apply(grad_output, grad2_grad_weight, None)
                assert grad2_input.shape == input_shape

            return grad2_grad_output, grad2_input

    _conv2d_gradfix_cache[key] = Conv2d
    return Conv2d

#----------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/conv2d_resample.py
================================================
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto.  Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

"""2D convolution with optional up/downsampling."""

import torch

from .. import misc
from . import conv2d_gradfix
from . import upfirdn2d
from .upfirdn2d import _parse_padding
from .upfirdn2d import _get_filter_size

#----------------------------------------------------------------------------

def _get_weight_shape(w):
    with misc.suppress_tracer_warnings(): # this value will be treated as a constant
        shape = [int(sz) for sz in w.shape]
    misc.assert_shape(w, shape)
    return shape

#----------------------------------------------------------------------------

def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True):
    """Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations.
    """
    _out_channels, _in_channels_per_group, kh, kw = _get_weight_shape(w)

    # Flip weight if requested.
    # Note: conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False).
    if not flip_weight and (kw > 1 or kh > 1):
        w = w.flip([2, 3])

    # Execute using conv2d_gradfix.
    op = conv2d_gradfix.conv_transpose2d if transpose else conv2d_gradfix.conv2d
    return op(x, w, stride=stride, padding=padding, groups=groups)

#----------------------------------------------------------------------------

@misc.profiled_function
def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False):
    r"""2D convolution with optional up/downsampling.

    Padding is performed only once at the beginning, not between the operations.

    Args:
        x:              Input tensor of shape
                        `[batch_size, in_channels, in_height, in_width]`.
        w:              Weight tensor of shape
                        `[out_channels, in_channels//groups, kernel_height, kernel_width]`.
        f:              Low-pass filter for up/downsampling. Must be prepared beforehand by
                        calling upfirdn2d.setup_filter(). None = identity (default).
        up:             Integer upsampling factor (default: 1).
        down:           Integer downsampling factor (default: 1).
        padding:        Padding with respect to the upsampled image. Can be a single number
                        or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]`
                        (default: 0).
        groups:         Split input channels into N groups (default: 1).
        flip_weight:    False = convolution, True = correlation (default: True).
        flip_filter:    False = convolution, True = correlation (default: False).

    Returns:
        Tensor of the shape `[batch_size, num_channels, out_height, out_width]`.
    """
    # Validate arguments.
    assert isinstance(x, torch.Tensor) and (x.ndim == 4)
    assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype)
    assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2] and f.dtype == torch.float32)
    assert isinstance(up, int) and (up >= 1)
    assert isinstance(down, int) and (down >= 1)
    assert isinstance(groups, int) and (groups >= 1)
    out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w)
    fw, fh = _get_filter_size(f)
    px0, px1, py0, py1 = _parse_padding(padding)

    # Adjust padding to account for up/downsampling.
    if up > 1:
        px0 += (fw + up - 1) // 2
        px1 += (fw - up) // 2
        py0 += (fh + up - 1) // 2
        py1 += (fh - up) // 2
    if down > 1:
        px0 += (fw - down + 1) // 2
        px1 += (fw - down) // 2
        py0 += (fh - down + 1) // 2
        py1 += (fh - down) // 2

    # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve.
    if kw == 1 and kh == 1 and (down > 1 and up == 1):
        x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
        return x

    # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample.
    if kw == 1 and kh == 1 and (up > 1 and down == 1):
        x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
        x = upfirdn2d.upfirdn2d(x=x, f=f, up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
        return x

    # Fast path: downsampling only => use strided convolution.
    if down > 1 and up == 1:
        x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0,px1,py0,py1], flip_filter=flip_filter)
        x = _conv2d_wrapper(x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight)
        return x

    # Fast path: upsampling with optional downsampling => use transpose strided convolution.
    if up > 1:
        if groups == 1:
            w = w.transpose(0, 1)
        else:
            w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw)
            w = w.transpose(1, 2)
            w = w.reshape(groups * in_channels_per_group, out_channels // groups, kh, kw)
        px0 -= kw - 1
        px1 -= kw - up
        py0 -= kh - 1
        py1 -= kh - up
        pxt = max(min(-px0, -px1), 0)
        pyt = max(min(-py0, -py1), 0)
        x = _conv2d_wrapper(x=x, w=w, stride=up, padding=[pyt,pxt], groups=groups, transpose=True, flip_weight=(not flip_weight))
        x = upfirdn2d.upfirdn2d(x=x, f=f, padding=[px0+pxt,px1+pxt,py0+pyt,py1+pyt], gain=up**2, flip_filter=flip_filter)
        if down > 1:
            x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
        return x

    # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d.
    if up == 1 and down == 1:
        if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0:
            return _conv2d_wrapper(x=x, w=w, padding=[py0,px0], groups=groups, flip_weight=flip_weight)

    # Fallback: Generic reference implementation.
    x = upfirdn2d.upfirdn2d(x=x, f=(f if up > 1 else None), up=up, padding=[px0,px1,py0,py1], gain=up**2, flip_filter=flip_filter)
    x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight)
    if down > 1:
        x = upfirdn2d.upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter)
    return x

#----------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/filtered_lrelu.cpp
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "filtered_lrelu.h"

//------------------------------------------------------------------------

static std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
    torch::Tensor x, torch::Tensor fu, torch::Tensor fd, torch::Tensor b, torch::Tensor si,
    int up, int down, int px0, int px1, int py0, int py1, int sx, int sy, float gain, float slope, float clamp, bool flip_filters, bool writeSigns)
{
    // Set CUDA device.
    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));

    // Validate arguments.
    TORCH_CHECK(fu.device() == x.device() && fd.device() == x.device() && b.device() == x.device(), "all input tensors must reside on the same device");
    TORCH_CHECK(fu.dtype() == torch::kFloat && fd.dtype() == torch::kFloat, "fu and fd must be float32");
    TORCH_CHECK(b.dtype() == x.dtype(), "x and b must have the same dtype");
    TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat, "x and b must be float16 or float32");
    TORCH_CHECK(x.dim() == 4, "x must be rank 4");
    TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
    TORCH_CHECK(x.numel() > 0, "x is empty");
    TORCH_CHECK((fu.dim() == 1 || fu.dim() == 2) && (fd.dim() == 1 || fd.dim() == 2), "fu and fd must be rank 1 or 2");
    TORCH_CHECK(fu.size(0) <= INT_MAX && fu.size(-1) <= INT_MAX, "fu is too large");
    TORCH_CHECK(fd.size(0) <= INT_MAX && fd.size(-1) <= INT_MAX, "fd is too large");
    TORCH_CHECK(fu.numel() > 0, "fu is empty");
    TORCH_CHECK(fd.numel() > 0, "fd is empty");
    TORCH_CHECK(b.dim() == 1 && b.size(0) == x.size(1), "b must be a vector with the same number of channels as x");
    TORCH_CHECK(up >= 1 && down >= 1, "up and down must be at least 1");

    // Figure out how much shared memory is available on the device.
    int maxSharedBytes = 0;
    AT_CUDA_CHECK(cudaDeviceGetAttribute(&maxSharedBytes, cudaDevAttrMaxSharedMemoryPerBlockOptin, x.device().index()));
    int sharedKB = maxSharedBytes >> 10;

    // Populate enough launch parameters to check if a CUDA kernel exists.
    filtered_lrelu_kernel_params p;
    p.up      = up;
    p.down    = down;
    p.fuShape = make_int2((int)fu.size(-1), fu.dim() == 2 ? (int)fu.size(0) : 0); // shape [n, 0] indicates separable filter.
    p.fdShape = make_int2((int)fd.size(-1), fd.dim() == 2 ? (int)fd.size(0) : 0);
    filtered_lrelu_kernel_spec test_spec = choose_filtered_lrelu_kernel<float, int32_t, false, false>(p, sharedKB);
    if (!test_spec.exec)
    {
        // No kernel found - return empty tensors and indicate missing kernel with return code of -1.
        return std::make_tuple(torch::Tensor(), torch::Tensor(), -1);
    }

    // Input/output element size.
    int64_t sz = (x.dtype() == torch::kHalf) ? 2 : 4;

    // Input sizes.
    int64_t xw = (int)x.size(3);
    int64_t xh = (int)x.size(2);
    int64_t fut_w = (int)fu.size(-1) - 1;
    int64_t fut_h = (int)fu.size(0)  - 1;
    int64_t fdt_w = (int)fd.size(-1) - 1;
    int64_t fdt_h = (int)fd.size(0)  - 1;

    // Logical size of upsampled buffer.
    int64_t cw = xw * up + (px0 + px1) - fut_w;
    int64_t ch = xh * up + (py0 + py1) - fut_h;
    TORCH_CHECK(cw > fdt_w && ch > fdt_h, "upsampled buffer must be at least the size of downsampling filter");
    TORCH_CHECK(cw <= INT_MAX && ch <= INT_MAX, "upsampled buffer is too large");

    // Compute output size and allocate.
    int64_t yw = (cw - fdt_w + (down - 1)) / down;
    int64_t yh = (ch - fdt_h + (down - 1)) / down;
    TORCH_CHECK(yw > 0 && yh > 0, "output must be at least 1x1");
    TORCH_CHECK(yw <= INT_MAX && yh <= INT_MAX, "output is too large");
    torch::Tensor y = torch::empty({x.size(0), x.size(1), yh, yw}, x.options(), x.suggest_memory_format());

    // Allocate sign tensor.
    torch::Tensor so;
    torch::Tensor s = si;
    bool readSigns = !!s.numel();
    int64_t sw_active = 0; // Active width of sign tensor.
    if (writeSigns)
    {
        sw_active = yw * down - (down - 1) + fdt_w;     // Active width in elements.
        int64_t sh = yh * down - (down - 1) + fdt_h;    // Height = active height.
        int64_t sw = (sw_active + 15) & ~15;            // Width  = active width in elements, rounded up to multiple of 16.
        TORCH_CHECK(sh <= INT_MAX && (sw >> 2) <= INT_MAX, "signs is too large");
        s = so = torch::empty({x.size(0), x.size(1), sh, sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous);
    }
    else if (readSigns)
        sw_active = s.size(3) << 2;

    // Validate sign tensor if in use.
    if (readSigns || writeSigns)
    {
        TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
        TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
        TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
        TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
        TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
        TORCH_CHECK(s.size(2) <= INT_MAX && s.size(3) <= INT_MAX, "signs is too large");
    }

    // Populate rest of CUDA kernel parameters.
    p.x         = x.data_ptr();
    p.y         = y.data_ptr();
    p.b         = b.data_ptr();
    p.s         = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;
    p.fu        = fu.data_ptr<float>();
    p.fd        = fd.data_ptr<float>();
    p.pad0      = make_int2(px0, py0);
    p.gain      = gain;
    p.slope     = slope;
    p.clamp     = clamp;
    p.flip      = (flip_filters) ? 1 : 0;
    p.xShape    = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
    p.yShape    = make_int4((int)y.size(3), (int)y.size(2), (int)y.size(1), (int)y.size(0));
    p.sShape    = (readSigns || writeSigns) ? make_int2((int)s.size(3), (int)s.size(2)) : make_int2(0, 0); // Width is in bytes. Contiguous.
    p.sOfs      = make_int2(sx, sy);
    p.swLimit   = (sw_active + 3) >> 2; // Rounded up to bytes.

    // x, y, b strides are in bytes.
    p.xStride   = make_longlong4(sz * x.stride(3), sz * x.stride(2), sz * x.stride(1), sz * x.stride(0));
    p.yStride   = make_longlong4(sz * y.stride(3), sz * y.stride(2), sz * y.stride(1), sz * y.stride(0));
    p.bStride   = sz * b.stride(0);

    // fu, fd strides are in elements.
    p.fuStride  = make_longlong3(fu.stride(-1), fu.dim() == 2 ? fu.stride(0) : 0, 0);
    p.fdStride  = make_longlong3(fd.stride(-1), fd.dim() == 2 ? fd.stride(0) : 0, 0);

    // Determine if indices don't fit in int32. Support negative strides although Torch currently never produces those.
    bool index64b = false;
    if (std::abs(p.bStride * x.size(1)) > INT_MAX) index64b = true;
    if (std::min(x.size(0) * p.xStride.w, 0ll) + std::min(x.size(1) * p.xStride.z, 0ll) + std::min(x.size(2) * p.xStride.y, 0ll) + std::min(x.size(3) * p.xStride.x, 0ll) < -INT_MAX) index64b = true;
    if (std::max(x.size(0) * p.xStride.w, 0ll) + std::max(x.size(1) * p.xStride.z, 0ll) + std::max(x.size(2) * p.xStride.y, 0ll) + std::max(x.size(3) * p.xStride.x, 0ll) >  INT_MAX) index64b = true;
    if (std::min(y.size(0) * p.yStride.w, 0ll) + std::min(y.size(1) * p.yStride.z, 0ll) + std::min(y.size(2) * p.yStride.y, 0ll) + std::min(y.size(3) * p.yStride.x, 0ll) < -INT_MAX) index64b = true;
    if (std::max(y.size(0) * p.yStride.w, 0ll) + std::max(y.size(1) * p.yStride.z, 0ll) + std::max(y.size(2) * p.yStride.y, 0ll) + std::max(y.size(3) * p.yStride.x, 0ll) >  INT_MAX) index64b = true;
    if (s.numel() > INT_MAX) index64b = true;

    // Choose CUDA kernel.
    filtered_lrelu_kernel_spec spec = { 0 };
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_cuda", [&]
    {
        if constexpr (sizeof(scalar_t) <= 4) // Exclude doubles. constexpr prevents template instantiation.
        {
            // Choose kernel based on index type, datatype and sign read/write modes.
            if      (!index64b &&  writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, true,  false>(p, sharedKB);
            else if (!index64b && !writeSigns &&  readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, true >(p, sharedKB);
            else if (!index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int32_t, false, false>(p, sharedKB);
            else if ( index64b &&  writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, true,  false>(p, sharedKB);
            else if ( index64b && !writeSigns &&  readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, true >(p, sharedKB);
            else if ( index64b && !writeSigns && !readSigns) spec = choose_filtered_lrelu_kernel<scalar_t, int64_t, false, false>(p, sharedKB);
        }
    });
    TORCH_CHECK(spec.exec, "internal error - CUDA kernel not found") // This should not happen because we tested earlier that kernel exists.

    // Launch CUDA kernel.
    void* args[] = {&p};
    int bx = spec.numWarps * 32;
    int gx = (p.yShape.x - 1) / spec.tileOut.x + 1;
    int gy = (p.yShape.y - 1) / spec.tileOut.y + 1;
    int gz = p.yShape.z * p.yShape.w;

    // Repeat multiple horizontal tiles in a CTA?
    if (spec.xrep)
    {
        p.tilesXrep = spec.xrep;
        p.tilesXdim = gx;

        gx = (gx + p.tilesXrep - 1) / p.tilesXrep;
        std::swap(gx, gy);
    }
    else
    {
        p.tilesXrep = 0;
        p.tilesXdim = 0;
    }

    // Launch filter setup kernel.
    AT_CUDA_CHECK(cudaLaunchKernel(spec.setup, 1, 1024, args, 0, at::cuda::getCurrentCUDAStream()));

    // Copy kernels to constant memory.
    if      ( writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<true,  false>(at::cuda::getCurrentCUDAStream())));
    else if (!writeSigns &&  readSigns) AT_CUDA_CHECK((copy_filters<false, true >(at::cuda::getCurrentCUDAStream())));
    else if (!writeSigns && !readSigns) AT_CUDA_CHECK((copy_filters<false, false>(at::cuda::getCurrentCUDAStream())));

    // Set cache and shared memory configurations for main kernel.
    AT_CUDA_CHECK(cudaFuncSetCacheConfig(spec.exec, cudaFuncCachePreferShared));
    if (spec.dynamicSharedKB) // Need dynamically allocated shared memory?
        AT_CUDA_CHECK(cudaFuncSetAttribute(spec.exec, cudaFuncAttributeMaxDynamicSharedMemorySize, spec.dynamicSharedKB << 10));
    AT_CUDA_CHECK(cudaFuncSetSharedMemConfig(spec.exec, cudaSharedMemBankSizeFourByte));

    // Launch main kernel.
    const int maxSubGz = 65535; // CUDA maximum for block z dimension.
    for (int zofs=0; zofs < gz; zofs += maxSubGz) // Do multiple launches if gz is too big.
    {
        p.blockZofs = zofs;
        int subGz = std::min(maxSubGz, gz - zofs);
        AT_CUDA_CHECK(cudaLaunchKernel(spec.exec, dim3(gx, gy, subGz), bx, args, spec.dynamicSharedKB << 10, at::cuda::getCurrentCUDAStream()));
    }

    // Done.
    return std::make_tuple(y, so, 0);
}

//------------------------------------------------------------------------

static torch::Tensor filtered_lrelu_act(torch::Tensor x, torch::Tensor si, int sx, int sy, float gain, float slope, float clamp, bool writeSigns)
{
    // Set CUDA device.
    TORCH_CHECK(x.is_cuda(), "x must reside on CUDA device");
    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));

    // Validate arguments.
    TORCH_CHECK(x.dim() == 4, "x must be rank 4");
    TORCH_CHECK(x.size(0) * x.size(1) <= INT_MAX && x.size(2) <= INT_MAX && x.size(3) <= INT_MAX, "x is too large");
    TORCH_CHECK(x.numel() > 0, "x is empty");
    TORCH_CHECK(x.dtype() == torch::kHalf || x.dtype() == torch::kFloat || x.dtype() == torch::kDouble, "x must be float16, float32 or float64");

    // Output signs if we don't have sign input.
    torch::Tensor so;
    torch::Tensor s = si;
    bool readSigns = !!s.numel();
    if (writeSigns)
    {
        int64_t sw = x.size(3);
        sw = (sw + 15) & ~15; // Round to a multiple of 16 for coalescing.
        s = so = torch::empty({x.size(0), x.size(1), x.size(2), sw >> 2}, x.options().dtype(torch::kUInt8), at::MemoryFormat::Contiguous);
    }

    // Validate sign tensor if in use.
    if (readSigns || writeSigns)
    {
        TORCH_CHECK(s.is_contiguous(), "signs must be contiguous");
        TORCH_CHECK(s.dtype() == torch::kUInt8, "signs must be uint8");
        TORCH_CHECK(s.device() == x.device(), "signs must reside on the same device as x");
        TORCH_CHECK(s.dim() == 4, "signs must be rank 4");
        TORCH_CHECK(s.size(0) == x.size(0) && s.size(1) == x.size(1), "signs must have same batch & channels as x");
        TORCH_CHECK(s.size(2) <= INT_MAX && (s.size(3) << 2) <= INT_MAX, "signs tensor is too large");
    }

    // Initialize CUDA kernel parameters.
    filtered_lrelu_act_kernel_params p;
    p.x         = x.data_ptr();
    p.s         = (readSigns || writeSigns) ? s.data_ptr<unsigned char>() : 0;
    p.gain      = gain;
    p.slope     = slope;
    p.clamp     = clamp;
    p.xShape    = make_int4((int)x.size(3), (int)x.size(2), (int)x.size(1), (int)x.size(0));
    p.xStride   = make_longlong4(x.stride(3), x.stride(2), x.stride(1), x.stride(0));
    p.sShape    = (readSigns || writeSigns) ? make_int2((int)s.size(3) << 2, (int)s.size(2)) : make_int2(0, 0); // Width is in elements. Contiguous.
    p.sOfs      = make_int2(sx, sy);

    // Choose CUDA kernel.
    void* func = 0;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "filtered_lrelu_act_cuda", [&]
    {
        if (writeSigns)
            func = choose_filtered_lrelu_act_kernel<scalar_t, true, false>();
        else if (readSigns)
            func = choose_filtered_lrelu_act_kernel<scalar_t, false, true>();
        else
            func = choose_filtered_lrelu_act_kernel<scalar_t, false, false>();
    });
    TORCH_CHECK(func, "internal error - CUDA kernel not found");

    // Launch CUDA kernel.
    void* args[] = {&p};
    int bx = 128; // 4 warps per block.

    // Logical size of launch = writeSigns ? p.s : p.x
    uint32_t gx = writeSigns ? p.sShape.x : p.xShape.x;
    uint32_t gy = writeSigns ? p.sShape.y : p.xShape.y;
    uint32_t gz = p.xShape.z * p.xShape.w; // Same as in p.sShape if signs are in use.
    gx = (gx - 1) / bx + 1;

    // Make sure grid y and z dimensions are within CUDA launch limits. Kernel loops internally to do the rest.
    const uint32_t gmax = 65535;
    gy = std::min(gy, gmax);
    gz = std::min(gz, gmax);

    // Launch.
    AT_CUDA_CHECK(cudaLaunchKernel(func, dim3(gx, gy, gz), bx, args, 0, at::cuda::getCurrentCUDAStream()));
    return so;
}

//------------------------------------------------------------------------

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    m.def("filtered_lrelu",      &filtered_lrelu);      // The whole thing.
    m.def("filtered_lrelu_act_", &filtered_lrelu_act);  // Activation and sign tensor handling only. Modifies data tensor in-place.
}

//------------------------------------------------------------------------


================================================
FILE: ADD/th_utils/ops/filtered_lrelu.cu
================================================
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.  Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include <c10/util/Half.h>
#include "filtered_lrelu.h"
#include <cstdint>

//------------------------------------------------------------------------
// Helpers.

enum // Filter modes.
{
    MODE_SUSD = 0,  // Separable upsampling, separable downsampling.
    MODE_FUSD = 1,  // Full upsampling, separable downsampling.
    MODE_SUFD = 2,  // Separable upsampling, full downsampling.
    MODE_FUFD = 3,  // Full upsampling, full downsampling.
};

template <class T> struct InternalType;
template <> struct InternalType<double>
{
    typedef double scalar_t; typedef double2 vec2_t; typedef double4 vec4_t;
    __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_double2(0, 0); }
    __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_double4(0, 0, 0, 0); }
    __device__ __forceinline__ static double clamp(double x, double c) { return fmin(fmax(x, -c), c); }
};
template <> struct InternalType<float>
{
    typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t;
    __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
    __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
    __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
};
template <> struct InternalType<c10::Half>
{
    typedef float scalar_t; typedef float2 vec2_t; typedef float4 vec4_t;
    __device__ __forceinline__ static vec2_t zero_vec2(void) { return make_float2(0, 0); }
    __device__ __forceinline__ static vec4_t zero_vec4(void) { return make_float4(0, 0, 0, 0); }
    __device__ __forceinline__ static float clamp(float x, float c) { return fminf(fmaxf(x, -c), c); }
};

#define MIN(A, B)       ((A) < (B) ? (A) : (B))
#define MAX(A, B)       ((A) > (B) ? (A) : (B))
#define CEIL_DIV(A, B) (((B)==1) ? (A) : \
                        ((B)==2) ? ((int)((A)+1) >> 1) : \
                        ((B)==4) ? ((int)((A)+3) >> 2) : \
                        (((A) + ((A) > 0 ? (B) - 1 : 0)) / (B)))

// This works only up to blocks of size 256 x 256 and for all N that are powers of two.
template <int N> __device__ __forceinline__ void fast_div_mod(int& x, int& y, unsigned int i)
{
    if ((N & (N-1)) && N <= 256)
        y = (i * ((1<<24)/N + 1)) >> 24; // Assumes N <= 256, i < N*256.
    else
        y = i/N;

    x = i - y*N;
}

// Type cast stride before reading it.
template <class T> __device__ __forceinline__ T get_stride(const int64_t& x)
{
    return *reinterpret_cast<const T*>(&x);
}

//------------------------------------------------------------------------
// Filters, setup kernel, copying function.

#define MAX_FILTER_SIZE 32

// Combined up/down filter buffers so that transfer can be done with one copy.
__device__              float g_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in global memory, written by setup kernel.
__device__ __constant__ float c_fbuf[2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE]; // Filters in constant memory, read by main kernel.

// Accessors to combined buffers to index up/down filters individually.
#define c_fu (c_fbuf)
#define c_fd (c_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)
#define g_fu (g_fbuf)
#define g_fd (g_fbuf + MAX_FILTER_SIZE * MAX_FILTER_SIZE)

// Set up filters into global memory buffer.
static __global__ void setup_filters_kernel(filtered_lrelu_kernel_params p)
{
    for (int idx = threadIdx.x; idx < MAX_FILTER_SIZE * MAX_FILTER_SIZE; idx += blockDim.x)
    {
        int x, y;
        fast_div_mod<MAX_FILTER_SIZE>(x, y, idx);

        int fu_x = p.flip ? x : (p.fuShape.x - 1 - x);
        int fu_y = p.flip ? y : (p.fuShape.y - 1 - y);
        if (p.fuShape.y > 0)
            g_fu[idx] = (x >= p.fuShape.x || y >= p.fuShape.y) ? 0.0f : p.fu[fu_x * p.fuStride.x + fu_y * p.fuStride.y];
        else
            g_fu[idx] = (x >= p.fuShape.x || y > 0) ? 0.0f : p.fu[fu_x * p.fuStride.x];

        int fd_x = p.flip ? x : (p.fdShape.x - 1 - x);
        int fd_y = p.flip ? y : (p.fdShape.y - 1 - y);
        if (p.fdShape.y > 0)
            g_fd[idx] = (x >= p.fdShape.x || y >= p.fdShape.y) ? 0.0f : p.fd[fd_x * p.fdStride.x + fd_y * p.fdStride.y];
        else
            g_fd[idx] = (x >= p.fdShape.x || y > 0) ? 0.0f : p.fd[fd_x * p.fdStride.x];
    }
}

// Host function to copy filters written by setup kernel into constant buffer for main kernel.
template <bool, bool> static cudaError_t copy_filters(cudaStream_t stream)
{
    void* src = 0;
    cudaError_t err = cudaGetSymbolAddress(&src, g_fbuf);
    if (err) return err;
    return cudaMemcpyToSymbolAsync(c_fbuf, src, 2 * MAX_FILTER_SIZE * MAX_FILTER_SIZE * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream);
}

//------------------------------------------------------------------------
// Coordinate spaces:
// - Relative to input tensor:      inX, inY, tileInX, tileInY
// - Relative to input tile:        relInX, relInY, tileInW, tileInH
// - Relative to upsampled tile:    relUpX, relUpY, tileUpW, tileUpH
// - Relative to output tile:       relOutX, relOutY, tileOutW, tileOutH
// - Relative to output tensor:     outX, outY, tileOutX, tileOutY
//
// Relationships between coordinate spaces:
// - inX = tileInX + relInX
// - inY = tileInY + relInY
// - relUpX = relInX * up + phaseInX
// - relUpY = relInY * up + phaseInY
// - relUpX = relOutX * down
// - relUpY = relOutY * down
// - outX = tileOutX + relOutX
// - outY = tileOutY + relOutY

extern __shared__ char s_buf_raw[]; // When sharedKB <= 48, allocate shared memory statically inside the kernel, otherwise use the externally allocated shared memory buffer.

template <class T, class index_t, int sharedKB, bool signWrite, bool signRead, int filterMode, int up, int fuSize, int down, int fdSize, int tileOutW, int tileOutH, int threadsPerBlock, bool enableXrep, bool enableWriteSkip>
static __global__ void filtered_lrelu_kernel(filtered_lrelu_kernel_params p)
{
    // Check that we don't try to support non-existing filter modes.
    static_assert(up   == 1 || up   == 2 || up   == 4, "only up=1, up=2, up=4 scales supported");
    static_assert(down == 1 || down == 2 || down == 4, "only down=1, down=2, down=4 scales supported");
    static_assert(fuSize >= up,   "upsampling filter size must be at least upsampling factor");
    static_assert(fdSize >= down, "downsampling filter size must be at least downsampling factor");
    static_assert(fuSize % up   == 0, "upsampling filter size must be divisible with upsampling factor");
    static_assert(fdSize % down == 0, "downsampling filter size must be divisible with downsampling factor");
    static_assert(fuSize <= MAX_FILTER_SIZE && fdSize <= MAX_FILTER_SIZE, "filter size greater than MAX_FILTER_SIZE");
    static_assert(up   != 1 || (fuSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "up=1 supported only for 1x1 full filters");
    static_assert(down != 1 || (fdSize == 1 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "down=1 supported only for 1x1 full filters");
    static_assert(!(up   == 4 && (filterMode == MODE_FUFD || filterMode == MODE_FUSD)), "full filters not supported for up=4");
    static_assert(!(down == 4 && (filterMode == MODE_FUFD || filterMode == MODE_SUFD)), "full filters not supported for down=4");

    // Static definitions.
    typedef typename InternalType<T>::scalar_t scalar_t;
    typedef typename InternalType<T>::vec2_t vec2_t;
    typedef typename InternalType<T>::vec4_t vec4_t;
    const int tileUpW    = (tileOutW * down + (fdSize - 1) - (down - 1) + 3) & ~3;  // Upsampled tile width, rounded up to multiple of 4.
    const int tileUpH    = tileOutH * down + (fdSize - 1) - (down - 1);             // Upsampled tile height.
    const int tileInW    = CEIL_DIV(tileUpW  + (fuSize - 1), up);                   // Input tile width.
    const int tileInH    = CEIL_DIV(tileUpH  + (fuSize - 1), up);                   // Input tile height.
    const int tileUpH_up = CEIL_DIV(tileUpH, up) * up;                              // Upsampled tile height rounded up to a multiple of up.
    const int tileInH_up = CEIL_DIV(tileUpH_up + (fuSize - 1), up);                 // For allocations only, to avoid shared memory read overruns with up=2 and up=4.

    // Merge 1x1 downsampling into last upsampling step for upf1 and ups2.
    const bool downInline = (down == 1) && ((up == 1 && filterMode == MODE_FUFD) || (up == 2 && filterMode == MODE_SUFD));

    // Sizes of logical buffers.
    const int szIn    = tileInH_up * tileInW;
    const int szUpX   = tileInH_up * tileUpW;
    const int szUpXY  = downInline ? 0 : (tileUpH * tileUpW);
    const int szDownX = tileUpH * tileOutW;

    // Sizes for shared memory arrays.
    const int s_buf0_size_base =
        (filterMode == MODE_SUSD) ? MAX(szIn, szUpXY) :
        (filterMode == MODE_FUSD) ? MAX(szIn, szDownX) :
        (filterMode == MODE_SUFD) ? MAX(szIn, szUpXY) :
        (filterMode == MODE_FUFD) ? szIn :
        -1;
    const int s_buf1_size_base =
        (filterMode == MODE_SUSD) ? MAX(szUpX, szDownX) :
        (filterMode == MODE_FUSD) ? szUpXY :
        (filterMode == MODE_SUFD) ? szUpX  :
        (filterMode == MODE_FUFD) ? szUpXY :
        -1;

    // Ensure U128 alignment.
    const int s_buf0_size = (s_buf0_size_base + 3) & ~3;
    const int s_buf1_size = (s_buf1_size_base + 3) & ~3;

    // Check at compile time that we don't use too much shared memory.
    static_assert((s_buf0_size + s_buf1_size) * sizeof(scalar_t) <= (sharedKB << 10), "shared memory overflow");

    // Declare shared memory arrays.
    scalar_t* s_buf0;
    scalar_t* s_buf1;
    if (sharedKB <= 48)
    {
        // Allocate shared memory arrays here.
        __shared__ scalar_t s_buf0_st[(sharedKB > 48) ? (1<<24) : (s_buf0_size + s_buf1_size)]; // Prevent launching if this isn't optimized away when unused.
        s_buf0 = s_buf0_st;
        s_buf1 = s_buf0 + s_buf0_size;
    }
    else
    {
        // Use the dynamically allocated shared memory array.
        s_buf0 = (scalar_t*)s_buf_raw;
        s_buf1 = s_buf0 + s_buf0_size;
    }

    // Pointers to the buffers.
    scalar_t* s_tileIn;       // Input tile:                      [relInX * tileInH + relInY]
    scalar_t* s_tileUpX;      // After horizontal upsampling:     [relInY * tileUpW + relUpX]
    scalar_t* s_tileUpXY;     // After upsampling:                [relUpY * tileUpW + relUpX]
    scalar_t* s_tileDownX;    // After horizontal downsampling:   [relUpY * tileOutW + relOutX]
    if (filterMode == MODE_SUSD)
    {
        s_tileIn    = s_buf0;
        s_tileUpX   = s_buf1;
        s_tileUpXY  = s_buf0;
        s_tileDownX = s_buf1;
    }
    else if (filterMode == MODE_FUSD)
    {
        s_tileIn    = s_buf0;
        s_tileUpXY  = s_buf1;
        s_tileDownX = s_buf0;
    }
    else if (filterMode == MODE_SUFD)
    {
        s_tileIn    = s_buf0;
        s_tileUpX   = s_buf1;
        s_tileUpXY  = s_buf0;
    }
    else if (filterMode == MODE_FUFD)
    {
        s_tileIn    = s_buf0;
        s_tileUpXY  = s_buf1;
    }

    // Allow large grids in z direction via per-launch offset.
    int channelIdx = blockIdx.z + p.blockZofs;
    int batchIdx = channelIdx / p.yShape.z;
    channelIdx -= batchIdx * p.yShape.z;

    // Offset to output feature map. In bytes.
    index_t mapOfsOut = channelIdx * get_stride<index_t>(p.yStride.z) + batchIdx * get_stride<index_t>(p.yStride.w);

    // Sign shift amount.
    uint32_t signXo = ((threadIdx.x + p.sOfs.x) << 1) & 6;

    // Inner tile loop.
    #pragma unroll 1
    for (int tileIdx = 0; !enableXrep || (tileIdx < MIN(p.tilesXrep, p.tilesXdim - p.tilesXrep * blockIdx.y)); tileIdx++)
    {
        // Locate output tile.
        int tileX = enableXrep ? blockIdx.y * p.tilesXrep + tileIdx : blockIdx.x;
        int tileOutX = tileX * tileOutW;
        int tileOutY = (enableXrep ? blockIdx.x : blockIdx.y) * tileOutH;

        // Locate input tile.
        int tmpX = tileOutX * down - p.pad0.x;
        int tmpY = tileOutY * down - p.pad0.y;
        int tileInX = CEIL_DIV(tmpX, up);
        int tileInY = CEIL_DIV(tmpY, up);
        const int phaseInX = tileInX * up - tmpX;
        const int phaseInY = tileInY * up - tmpY;

        // Extra sync if input and output buffers are the same and we are not on first tile.
        if (enableXrep && tileIdx > 0 && (filterMode == MODE_FUSD || (filterMode == MODE_SUFD && !downInline) || (filterMode == MODE_FUFD && downInline)))
            __syncthreads();

        // Load input tile & apply bias. Unrolled.
        scalar_t b = (scalar_t)*(const T*)((const char*)p.b + (channelIdx * get_stride<index_t>(p.bStride)));
        index_t mapOfsIn = channelIdx * get_stride<index_t>(p.xStride.z) + batchIdx * get_stride<index_t>(p.xStride.w);
        int idx = threadIdx.x;
        const int loopCountIN = CEIL_DIV(tileInW * tileInH, threadsPerBlock);
        #pragma unroll
        for (int loop = 0; loop < loopCountIN; loop++)
        {
            int relInX, relInY;
            fast_div_mod<tileInW>(relInX, relInY, idx);
            int inX = tileInX + relInX;
            int inY = tileInY + relInY;
            scalar_t v = 0;

            if ((uint32_t)inX < p.xShape.x && (uint32_t)inY < p.xShape.y)
                v = (scalar_t)*((const T*)((const char*)p.x + (inX * get_stride<index_t>(p.xStride.x) + inY * get_stride<index_t>(p.xStride.y) + mapOfsIn))) + b;

            bool skip = (loop == loopCountIN-1) && (idx >= tileInW * tileInH);
            if (!skip)
                s_tileIn[idx] = v;

            idx += threadsPerBlock;
        }

        if (filterMode == MODE_SUSD || filterMode == MODE_SUFD) // Separable upsampling filter.
        {
            // Horizontal upsampling.
            __syncthreads();
            if (up == 4)
            {
                for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
                {
                    int relUpX0, relInY;
                    fast_div_mod<tileUpW>(relUpX0, relInY, idx);
                    int relInX0 = relUpX0 / up;
                    int src0 = relInX0 + tileInW * relInY;
                    int dst = relInY * tileUpW + relUpX0;
                    vec4_t v = InternalType<T>::zero_vec4();
                    scalar_t a = s_tileIn[src0];
                    if (phaseInX == 0)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileIn[src0 + step + 1];
                            v.y += a * (scalar_t)c_fu[step * up + 3];
                            v.z += a * (scalar_t)c_fu[step * up + 2];
                            v.w += a * (scalar_t)c_fu[step * up + 1];
                        }
                    }
                    else if (phaseInX == 1)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 1];
                            v.y += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileIn[src0 + step + 1];
                            v.z += a * (scalar_t)c_fu[step * up + 3];
                            v.w += a * (scalar_t)c_fu[step * up + 2];
                        }
                    }
                    else if (phaseInX == 2)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 2];
                            v.y += a * (scalar_t)c_fu[step * up + 1];
                            v.z += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileIn[src0 + step + 1];
                            v.w += a * (scalar_t)c_fu[step * up + 3];
                        }
                    }
                    else // (phaseInX == 3)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 3];
                            v.y += a * (scalar_t)c_fu[step * up + 2];
                            v.z += a * (scalar_t)c_fu[step * up + 1];
                            v.w += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileIn[src0 + step + 1];
                        }
                    }
                    s_tileUpX[dst+0] = v.x;
                    s_tileUpX[dst+1] = v.y;
                    s_tileUpX[dst+2] = v.z;
                    s_tileUpX[dst+3] = v.w;
                }
            }
            else if (up == 2)
            {
                bool p0 = (phaseInX == 0);
                for (int idx = threadIdx.x*up; idx < tileUpW * tileInH; idx += blockDim.x*up)
                {
                    int relUpX0, relInY;
                    fast_div_mod<tileUpW>(relUpX0, relInY, idx);
                    int relInX0 = relUpX0 / up;
                    int src0 = relInX0 + tileInW * relInY;
                    int dst = relInY * tileUpW + relUpX0;
                    vec2_t v = InternalType<T>::zero_vec2();
                    scalar_t a = s_tileIn[src0];
                    if (p0) // (phaseInX == 0)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileIn[src0 + step + 1];
                            v.y += a * (scalar_t)c_fu[step * up + 1];
                        }
                    }
                    else // (phaseInX == 1)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 1];
                            v.y += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileIn[src0 + step + 1];
                        }
                    }
                    s_tileUpX[dst+0] = v.x;
                    s_tileUpX[dst+1] = v.y;
                }
            }

            // Vertical upsampling & nonlinearity.

            __syncthreads();
            int groupMask = 15 << ((threadIdx.x & 31) & ~3);
            int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
            int sShapeMaxY = MIN(p.sShape.y, tileOutY * down + tileUpH); // Avoid out-of-tile sign writes.
            if (up == 4)
            {
                minY -= 3; // Adjust according to block height.
                for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
                {
                    int relUpX, relInY0;
                    fast_div_mod<tileUpW>(relUpX, relInY0, idx);
                    int relUpY0 = relInY0 * up;
                    int src0 = relInY0 * tileUpW + relUpX;
                    int dst = relUpY0 * tileUpW + relUpX;
                    vec4_t v = InternalType<T>::zero_vec4();

                    scalar_t a = s_tileUpX[src0];
                    if (phaseInY == 0)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
                            v.y += a * (scalar_t)c_fu[step * up + 3];
                            v.z += a * (scalar_t)c_fu[step * up + 2];
                            v.w += a * (scalar_t)c_fu[step * up + 1];
                        }
                    }
                    else if (phaseInY == 1)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 1];
                            v.y += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
                            v.z += a * (scalar_t)c_fu[step * up + 3];
                            v.w += a * (scalar_t)c_fu[step * up + 2];
                        }
                    }
                    else if (phaseInY == 2)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 2];
                            v.y += a * (scalar_t)c_fu[step * up + 1];
                            v.z += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
                            v.w += a * (scalar_t)c_fu[step * up + 3];
                        }
                    }
                    else // (phaseInY == 3)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 3];
                            v.y += a * (scalar_t)c_fu[step * up + 2];
                            v.z += a * (scalar_t)c_fu[step * up + 1];
                            v.w += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
                        }
                    }

                    int x = tileOutX * down + relUpX;
                    int y = tileOutY * down + relUpY0;
                    int signX = x + p.sOfs.x;
                    int signY = y + p.sOfs.y;
                    int signZ = blockIdx.z + p.blockZofs;
                    int signXb = signX >> 2;
                    index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
                    index_t si1 = si0 + p.sShape.x;
                    index_t si2 = si0 + p.sShape.x * 2;
                    index_t si3 = si0 + p.sShape.x * 3;

                    v.x *= (scalar_t)((float)up * (float)up * p.gain);
                    v.y *= (scalar_t)((float)up * (float)up * p.gain);
                    v.z *= (scalar_t)((float)up * (float)up * p.gain);
                    v.w *= (scalar_t)((float)up * (float)up * p.gain);

                    if (signWrite)
                    {
                        if (!enableWriteSkip)
                        {
                            // Determine and write signs.
                            int sx = __float_as_uint(v.x) >> 31 <<  0;
                            int sy = __float_as_uint(v.y) >> 31 <<  8;
                            int sz = __float_as_uint(v.z) >> 31 << 16;
                            int sw = __float_as_uint(v.w) >> 31 << 24;
                            if (sx) v.x *= p.slope;
                            if (sy) v.y *= p.slope;
                            if (sz) v.z *= p.slope;
                            if (sw) v.w *= p.slope;
                            if (fabsf(v.x) > p.clamp) { sx = 2 <<  0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
                            if (fabsf(v.y) > p.clamp) { sy = 2 <<  8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
                            if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
                            if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }

                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
                            {
                                // Combine signs.
                                uint32_t s = sx + sy + sw + sz;
                                s <<= (signX & 3) << 1;
                                s |= __shfl_xor_sync(groupMask, s, 1);
                                s |= __shfl_xor_sync(groupMask, s, 2);

                                // Write signs.
                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
                                if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
                                if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
                            }
                        }
                        else
                        {
                            // Determine and write signs.
                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
                            {
                                int sx = __float_as_uint(v.x) >> 31 <<  0;
                                int sy = __float_as_uint(v.y) >> 31 <<  8;
                                int sz = __float_as_uint(v.z) >> 31 << 16;
                                int sw = __float_as_uint(v.w) >> 31 << 24;
                                if (sx) v.x *= p.slope;
                                if (sy) v.y *= p.slope;
                                if (sz) v.z *= p.slope;
                                if (sw) v.w *= p.slope;
                                if (fabsf(v.x) > p.clamp) { sx = 2 <<  0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
                                if (fabsf(v.y) > p.clamp) { sy = 2 <<  8; v.y = InternalType<T>::clamp(v.y, p.clamp); }
                                if (fabsf(v.z) > p.clamp) { sz = 2 << 16; v.z = InternalType<T>::clamp(v.z, p.clamp); }
                                if (fabsf(v.w) > p.clamp) { sw = 2 << 24; v.w = InternalType<T>::clamp(v.w, p.clamp); }

                                // Combine signs.
                                uint32_t s = sx + sy + sw + sz;
                                s <<= (signX & 3) << 1;
                                s |= __shfl_xor_sync(groupMask, s, 1);
                                s |= __shfl_xor_sync(groupMask, s, 2);

                                // Write signs.
                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
                                if ((uint32_t)(signY + 2) < sShapeMaxY) { p.s[si2] = (unsigned char)(s >> 16); }
                                if ((uint32_t)(signY + 3) < sShapeMaxY) { p.s[si3] = (unsigned char)(s >> 24); }
                            }
                            else
                            {
                                // Just compute the values.
                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
                                if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
                                if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
                            }
                        }
                    }
                    else if (signRead) // Read signs and apply.
                    {
                        if ((uint32_t)signXb < p.swLimit)
                        {
                            int ss = (signX & 3) << 1;
                            if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> ss; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
                            if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> ss; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
                            if ((uint32_t)(signY + 2) < p.sShape.y) { int s = p.s[si2] >> ss; if (s & 1) v.z *= p.slope; if (s & 2) v.z = 0.f; }
                            if ((uint32_t)(signY + 3) < p.sShape.y) { int s = p.s[si3] >> ss; if (s & 1) v.w *= p.slope; if (s & 2) v.w = 0.f; }
                        }
                    }
                    else // Forward pass with no sign write.
                    {
                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
                        if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
                        if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
                    }

                    s_tileUpXY[dst + 0 * tileUpW] = v.x;
                    if (relUpY0 + 1 < tileUpH) s_tileUpXY[dst + 1 * tileUpW] = v.y;
                    if (relUpY0 + 2 < tileUpH) s_tileUpXY[dst + 2 * tileUpW] = v.z;
                    if (relUpY0 + 3 < tileUpH) s_tileUpXY[dst + 3 * tileUpW] = v.w;
                }
            }
            else if (up == 2)
            {
                minY -= 1; // Adjust according to block height.
                for (int idx = threadIdx.x; idx < tileUpW * tileUpH_up / up; idx += blockDim.x)
                {
                    int relUpX, relInY0;
                    fast_div_mod<tileUpW>(relUpX, relInY0, idx);
                    int relUpY0 = relInY0 * up;
                    int src0 = relInY0 * tileUpW + relUpX;
                    int dst = relUpY0 * tileUpW + relUpX;
                    vec2_t v = InternalType<T>::zero_vec2();

                    scalar_t a = s_tileUpX[src0];
                    if (phaseInY == 0)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
                            v.y += a * (scalar_t)c_fu[step * up + 1];
                        }
                    }
                    else // (phaseInY == 1)
                    {
                        #pragma unroll
                        for (int step = 0; step < fuSize / up; step++)
                        {
                            v.x += a * (scalar_t)c_fu[step * up + 1];
                            v.y += a * (scalar_t)c_fu[step * up + 0];
                            a = s_tileUpX[src0 + (step + 1) * tileUpW];
                        }
                    }

                    int x = tileOutX * down + relUpX;
                    int y = tileOutY * down + relUpY0;
                    int signX = x + p.sOfs.x;
                    int signY = y + p.sOfs.y;
                    int signZ = blockIdx.z + p.blockZofs;
                    int signXb = signX >> 2;
                    index_t si0 = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
                    index_t si1 = si0 + p.sShape.x;

                    v.x *= (scalar_t)((float)up * (float)up * p.gain);
                    v.y *= (scalar_t)((float)up * (float)up * p.gain);

                    if (signWrite)
                    {
                        if (!enableWriteSkip)
                        {
                            // Determine and write signs.
                            int sx = __float_as_uint(v.x) >> 31 << 0;
                            int sy = __float_as_uint(v.y) >> 31 << 8;
                            if (sx) v.x *= p.slope;
                            if (sy) v.y *= p.slope;
                            if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
                            if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }

                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
                            {
                                // Combine signs.
                                int s = sx + sy;
                                s <<= signXo;
                                s |= __shfl_xor_sync(groupMask, s, 1);
                                s |= __shfl_xor_sync(groupMask, s, 2);

                                // Write signs.
                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
                            }
                        }
                        else
                        {
                            // Determine and write signs.
                            if ((uint32_t)signXb < p.swLimit && signY >= minY)
                            {
                                int sx = __float_as_uint(v.x) >> 31 << 0;
                                int sy = __float_as_uint(v.y) >> 31 << 8;
                                if (sx) v.x *= p.slope;
                                if (sy) v.y *= p.slope;
                                if (fabsf(v.x) > p.clamp) { sx = 2 << 0; v.x = InternalType<T>::clamp(v.x, p.clamp); }
                                if (fabsf(v.y) > p.clamp) { sy = 2 << 8; v.y = InternalType<T>::clamp(v.y, p.clamp); }

                                // Combine signs.
                                int s = sx + sy;
                                s <<= signXo;
                                s |= __shfl_xor_sync(groupMask, s, 1);
                                s |= __shfl_xor_sync(groupMask, s, 2);

                                // Write signs.
                                if ((uint32_t)(signY + 0) < sShapeMaxY) { p.s[si0] = (unsigned char)(s >>  0); }
                                if ((uint32_t)(signY + 1) < sShapeMaxY) { p.s[si1] = (unsigned char)(s >>  8); }
                            }
                            else
                            {
                                // Just compute the values.
                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
                            }
                        }
                    }
                    else if (signRead) // Read signs and apply.
                    {
                        if ((uint32_t)signXb < p.swLimit)
                        {
                            if ((uint32_t)(signY + 0) < p.sShape.y) { int s = p.s[si0] >> signXo; if (s & 1) v.x *= p.slope; if (s & 2) v.x = 0.f; }
                            if ((uint32_t)(signY + 1) < p.sShape.y) { int s = p.s[si1] >> signXo; if (s & 1) v.y *= p.slope; if (s & 2) v.y = 0.f; }
                        }
                    }
                    else // Forward pass with no sign write.
                    {
                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
                    }

                    if (!downInline)
                    {
                        // Write into temporary buffer.
                        s_tileUpXY[dst] = v.x;
                        if (relUpY0 < tileUpH - 1)
                            s_tileUpXY[dst + tileUpW] = v.y;
                    }
                    else
                    {
                        // Write directly into output buffer.
                        if ((uint32_t)x < p.yShape.x)
                        {
                            int ymax = MIN(p.yShape.y, tileUpH + tileOutY * down);
                            index_t ofs = x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut;
                            if ((uint32_t)y + 0 < p.yShape.y) *((T*)((char*)p.y + ofs)) = (T)(v.x * (scalar_t)c_fd[0]);
                            if ((uint32_t)y + 1 < ymax) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.y))) = (T)(v.y * (scalar_t)c_fd[0]);
                        }
                    }
                }
            }
        }
        else if (filterMode == MODE_FUSD || filterMode == MODE_FUFD)
        {
            // Full upsampling filter.

            if (up == 2)
            {
                // 2 x 2-wide.
                __syncthreads();
                int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH + p.sOfs.y : 0; // Skip already written signs.
                for (int idx = threadIdx.x * 4; idx < tileUpW * tileUpH; idx += blockDim.x * 4)
                {
                    int relUpX0, relUpY0;
                    fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
                    int relInX0 = CEIL_DIV(relUpX0 - phaseInX, up);
                    int relInY0 = CEIL_DIV(relUpY0 - phaseInY, up);
                    int src0 = relInX0 + tileInW * relInY0;
                    int tap0y = (relInY0 * up + phaseInY - relUpY0);

                    #define X_LOOP(TAPY, PX) \
                        for (int sx = 0; sx < fuSize / up; sx++) \
                        { \
                            v.x += a * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
                            v.z += b * (scalar_t)c_fu[(sx * up + (((PX) - 0) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 0) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
                            v.y += a * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; \
                            v.w += b * (scalar_t)c_fu[(sx * up + (((PX) - 1) & (up - 1))) + (sy * up + (TAPY)) * MAX_FILTER_SIZE]; if ((PX) == 1) { a = b; b = s_tileIn[src0 + 2 + sx + sy * tileInW]; } \
                        }

                    vec4_t v = InternalType<T>::zero_vec4();
                    if (tap0y == 0 && phaseInX == 0)
                        #pragma unroll
                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
                            #pragma unroll
                            X_LOOP(0, 0) }
                    if (tap0y == 0 && phaseInX == 1)
                        #pragma unroll
                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
                            #pragma unroll
                            X_LOOP(0, 1) }
                    if (tap0y == 1 && phaseInX == 0)
                        #pragma unroll
                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
                            #pragma unroll
                            X_LOOP(1, 0) }
                    if (tap0y == 1 && phaseInX == 1)
                        #pragma unroll
                        for (int sy = 0; sy < fuSize / up; sy++) { scalar_t a = s_tileIn[src0 + sy * tileInW]; scalar_t b = s_tileIn[src0 + sy * tileInW + 1];
                            #pragma unroll
                            X_LOOP(1, 1) }

                    #undef X_LOOP

                    int x = tileOutX * down + relUpX0;
                    int y = tileOutY * down + relUpY0;
                    int signX = x + p.sOfs.x;
                    int signY = y + p.sOfs.y;
                    int signZ = blockIdx.z + p.blockZofs;
                    int signXb = signX >> 2;
                    index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);

                    v.x *= (scalar_t)((float)up * (float)up * p.gain);
                    v.y *= (scalar_t)((float)up * (float)up * p.gain);
                    v.z *= (scalar_t)((float)up * (float)up * p.gain);
                    v.w *= (scalar_t)((float)up * (float)up * p.gain);

                    if (signWrite)
                    {
                        if (!enableWriteSkip)
                        {
                            // Determine and write signs.
                            int sx = __float_as_uint(v.x) >> 31;
                            int sy = __float_as_uint(v.y) >> 31;
                            int sz = __float_as_uint(v.z) >> 31;
                            int sw = __float_as_uint(v.w) >> 31;
                            if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
                            if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
                            if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
                            if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }

                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
                            {
                                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
                            }
                        }
                        else
                        {
                            // Determine and write signs.
                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
                            {
                                int sx = __float_as_uint(v.x) >> 31;
                                int sy = __float_as_uint(v.y) >> 31;
                                int sz = __float_as_uint(v.z) >> 31;
                                int sw = __float_as_uint(v.w) >> 31;
                                if (sx) v.x *= p.slope; if (fabsf(v.x) > p.clamp) { sx = 2; v.x = InternalType<T>::clamp(v.x, p.clamp); }
                                if (sy) v.y *= p.slope; if (fabsf(v.y) > p.clamp) { sy = 2; v.y = InternalType<T>::clamp(v.y, p.clamp); }
                                if (sz) v.z *= p.slope; if (fabsf(v.z) > p.clamp) { sz = 2; v.z = InternalType<T>::clamp(v.z, p.clamp); }
                                if (sw) v.w *= p.slope; if (fabsf(v.w) > p.clamp) { sw = 2; v.w = InternalType<T>::clamp(v.w, p.clamp); }

                                p.s[si] = sx + (sy << 2) + (sz << 4) + (sw << 6);
                            }
                            else
                            {
                                // Just compute the values.
                                if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
                                if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
                                if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
                                if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
                            }
                        }
                    }
                    else if (signRead) // Read sign and apply.
                    {
                        if ((uint32_t)signY < p.sShape.y)
                        {
                            int s = 0;
                            if ((uint32_t)signXb     < p.swLimit) s  = p.s[si];
                            if ((uint32_t)signXb + 1 < p.swLimit) s |= p.s[si + 1] << 8;
                            s >>= (signX & 3) << 1;
                            if (s & 0x01) v.x *= p.slope; if (s & 0x02) v.x = 0.f;
                            if (s & 0x04) v.y *= p.slope; if (s & 0x08) v.y = 0.f;
                            if (s & 0x10) v.z *= p.slope; if (s & 0x20) v.z = 0.f;
                            if (s & 0x40) v.w *= p.slope; if (s & 0x80) v.w = 0.f;
                        }
                    }
                    else // Forward pass with no sign write.
                    {
                        if (v.x < 0.f) v.x *= p.slope; v.x = InternalType<T>::clamp(v.x, p.clamp);
                        if (v.y < 0.f) v.y *= p.slope; v.y = InternalType<T>::clamp(v.y, p.clamp);
                        if (v.z < 0.f) v.z *= p.slope; v.z = InternalType<T>::clamp(v.z, p.clamp);
                        if (v.w < 0.f) v.w *= p.slope; v.w = InternalType<T>::clamp(v.w, p.clamp);
                    }

                    s_tileUpXY[idx + 0] = v.x;
                    s_tileUpXY[idx + 1] = v.y;
                    s_tileUpXY[idx + 2] = v.z;
                    s_tileUpXY[idx + 3] = v.w;
                }
            }
            else if (up == 1)
            {
                __syncthreads();
                uint32_t groupMask = 15 << ((threadIdx.x & 31) & ~3);
                int minY = tileOutY ? (tileOutY - tileOutH) * down + tileUpH : 0; // Skip already written signs.
                for (int idx = threadIdx.x; idx < tileUpW * tileUpH; idx += blockDim.x)
                {
                    int relUpX0, relUpY0;
                    fast_div_mod<tileUpW>(relUpX0, relUpY0, idx);
                    scalar_t v = s_tileIn[idx] * (scalar_t)c_fu[0]; // 1x1 filter.

                    int x = tileOutX * down + relUpX0;
                    int y = tileOutY * down + relUpY0;
                    int signX = x + p.sOfs.x;
                    int signY = y + p.sOfs.y;
                    int signZ = blockIdx.z + p.blockZofs;
                    int signXb = signX >> 2;
                    index_t si = signXb + p.sShape.x * (signY + (index_t)p.sShape.y * signZ);
                    v *= (scalar_t)((float)up * (float)up * p.gain);

                    if (signWrite)
                    {
                        if (!enableWriteSkip)
                        {
                            // Determine and write sign.
                            uint32_t s = 0;
                            uint32_t signXbit = (1u << signXo);
                            if (v < 0.f)
                            {
                                s = signXbit;
                                v *= p.slope;
                            }
                            if (fabsf(v) > p.clamp)
                            {
                                s = signXbit * 2;
                                v = InternalType<T>::clamp(v, p.clamp);
                            }
                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
                            {
                                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
                                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
                                p.s[si] = s;                            // Write.
                            }
                        }
                        else
                        {
                            // Determine and write sign.
                            if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y && signY >= minY)
                            {
                                uint32_t s = 0;
                                uint32_t signXbit = (1u << signXo);
                                if (v < 0.f)
                                {
                                    s = signXbit;
                                    v *= p.slope;
                                }
                                if (fabsf(v) > p.clamp)
                                {
                                    s = signXbit * 2;
                                    v = InternalType<T>::clamp(v, p.clamp);
                                }
                                s += __shfl_xor_sync(groupMask, s, 1);  // Coalesce.
                                s += __shfl_xor_sync(groupMask, s, 2);  // Coalesce.
                                p.s[si] = s;                            // Write.
                            }
                            else
                            {
                                // Just compute the value.
                                if (v < 0.f) v *= p.slope;
                                v = InternalType<T>::clamp(v, p.clamp);
                            }
                        }
                    }
                    else if (signRead)
                    {
                        // Read sign and apply if within sign tensor bounds.
                        if ((uint32_t)signXb < p.swLimit && (uint32_t)signY < p.sShape.y)
                        {
                            int s = p.s[si];
                            s >>= signXo;
                            if (s & 1) v *= p.slope;
                            if (s & 2) v = 0.f;
                        }
                    }
                    else // Forward pass with no sign write.
                    {
                        if (v < 0.f) v *= p.slope;
                        v = InternalType<T>::clamp(v, p.clamp);
                    }

                    if (!downInline) // Write into temporary buffer.
                        s_tileUpXY[idx] = v;
                    else if ((uint32_t)x < p.yShape.x && (uint32_t)y < p.yShape.y) // Write directly into output buffer
                        *((T*)((char*)p.y + (x * get_stride<index_t>(p.yStride.x) + y * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)(v * (scalar_t)c_fd[0]);
                }
            }
        }

        // Downsampling.
        if (filterMode == MODE_SUSD || filterMode == MODE_FUSD)
        {
            // Horizontal downsampling.
            __syncthreads();
            if (down == 4 && tileOutW % 4 == 0)
            {
                // Calculate 4 pixels at a time.
                for (int idx = threadIdx.x * 4; idx < tileOutW * tileUpH; idx += blockDim.x * 4)
                {
                    int relOutX0, relUpY;
                    fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
                    int relUpX0 = relOutX0 * down;
                    int src0 = relUpY * tileUpW + relUpX0;
                    vec4_t v = InternalType<T>::zero_vec4();
                    #pragma unroll
                    for (int step = 0; step < fdSize; step++)
                    {
                        v.x += s_tileUpXY[src0 +  0 + step] * (scalar_t)c_fd[step];
                        v.y += s_tileUpXY[src0 +  4 + step] * (scalar_t)c_fd[step];
                        v.z += s_tileUpXY[src0 +  8 + step] * (scalar_t)c_fd[step];
                        v.w += s_tileUpXY[src0 + 12 + step] * (scalar_t)c_fd[step];
                    }
                    s_tileDownX[idx+0] = v.x;
                    s_tileDownX[idx+1] = v.y;
                    s_tileDownX[idx+2] = v.z;
                    s_tileDownX[idx+3] = v.w;
                }
            }
            else if ((down == 2 || down == 4) && (tileOutW % 2 == 0))
            {
                // Calculate 2 pixels at a time.
                for (int idx = threadIdx.x * 2; idx < tileOutW * tileUpH; idx += blockDim.x * 2)
                {
                    int relOutX0, relUpY;
                    fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
                    int relUpX0 = relOutX0 * down;
                    int src0 = relUpY * tileUpW + relUpX0;
                    vec2_t v = InternalType<T>::zero_vec2();
                    #pragma unroll
                    for (int step = 0; step < fdSize; step++)
                    {
                        v.x += s_tileUpXY[src0 +    0 + step] * (scalar_t)c_fd[step];
                        v.y += s_tileUpXY[src0 + down + step] * (scalar_t)c_fd[step];
                    }
                    s_tileDownX[idx+0] = v.x;
                    s_tileDownX[idx+1] = v.y;
                }
            }
            else
            {
                // Calculate 1 pixel at a time.
                for (int idx = threadIdx.x; idx < tileOutW * tileUpH; idx += blockDim.x)
                {
                    int relOutX0, relUpY;
                    fast_div_mod<tileOutW>(relOutX0, relUpY, idx);
                    int relUpX0 = relOutX0 * down;
                    int src = relUpY * tileUpW + relUpX0;
                    scalar_t v = 0.f;
                    #pragma unroll
                    for (int step = 0; step < fdSize; step++)
                        v += s_tileUpXY[src + step] * (scalar_t)c_fd[step];
                    s_tileDownX[idx] = v;
                }
            }

            // Vertical downsampling & store output tile.
            __syncthreads();
            for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
            {
                int relOutX, relOutY0;
                fast_div_mod<tileOutW>(relOutX, relOutY0, idx);
                int relUpY0 = relOutY0 * down;
                int src0 = relUpY0 * tileOutW + relOutX;
                scalar_t v = 0;
                #pragma unroll
                for (int step = 0; step < fdSize; step++)
                    v += s_tileDownX[src0 + step * tileOutW] * (scalar_t)c_fd[step];

                int outX = tileOutX + relOutX;
                int outY = tileOutY + relOutY0;

                if (outX < p.yShape.x & outY < p.yShape.y)
                    *((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
            }
        }
        else if (filterMode == MODE_SUFD || filterMode == MODE_FUFD)
        {
            // Full downsampling filter.
            if (down == 2)
            {
                // 2-wide.
                __syncthreads();
                for (int idx = threadIdx.x * 2; idx < tileOutW * tileOutH; idx += blockDim.x * 2)
                {
                    int relOutX0, relOutY0;
                    fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
                    int relUpX0 = relOutX0 * down;
                    int relUpY0 = relOutY0 * down;
                    int src0 = relUpY0 * tileUpW + relUpX0;
                    vec2_t v = InternalType<T>::zero_vec2();
                    #pragma unroll
                    for (int sy = 0; sy < fdSize; sy++)
                    #pragma unroll
                    for (int sx = 0; sx < fdSize; sx++)
                    {
                        v.x += s_tileUpXY[src0 + 0 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
                        v.y += s_tileUpXY[src0 + 2 + sx + sy * tileUpW] * (scalar_t)c_fd[sx + sy * MAX_FILTER_SIZE];
                    }

                    int outX = tileOutX + relOutX0;
                    int outY = tileOutY + relOutY0;
                    if ((uint32_t)outY < p.yShape.y)
                    {
                        index_t ofs = outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut;
                        if (outX + 0 < p.yShape.x) *((T*)((char*)p.y + ofs)) = (T)v.x;
                        if (outX + 1 < p.yShape.x) *((T*)((char*)p.y + ofs + get_stride<index_t>(p.yStride.x))) = (T)v.y;
                    }
                }
            }
            else if (down == 1 && !downInline)
            {
                // Thread per pixel.
                __syncthreads();
                for (int idx = threadIdx.x; idx < tileOutW * tileOutH; idx += blockDim.x)
                {
                    int relOutX0, relOutY0;
                    fast_div_mod<tileOutW>(relOutX0, relOutY0, idx);
                    scalar_t v = s_tileUpXY[idx] * (scalar_t)c_fd[0]; // 1x1 filter.

                    int outX = tileOutX + relOutX0;
                    int outY = tileOutY + relOutY0;
                    if ((uint32_t)outX < p.yShape.x && (uint32_t)outY < p.yShape.y)
                        *((T*)((char*)p.y + (outX * get_stride<index_t>(p.yStride.x) + outY * get_stride<index_t>(p.yStride.y) + mapOfsOut))) = (T)v;
                }
            }
        }

        if (!enableXrep)
            break;
    }
}

//------------------------------------------------------------------------
// Compute activation function and signs for upsampled data tensor, modifying data tensor in-place. Used for accelerating the generic variant.
// Sign tensor is known to be contiguous, and p.x and p.s have the same z, w dimensions. 64-bit indexing is always used.

template <class T, bool signWrite, bool signRead>
static __global__ void filtered_lrelu_act_kernel(filtered_lrelu_act_kernel_params p)
{
    typedef typename InternalType<T>::scalar_t scalar_t;

    // Indexing.
    int32_t x = threadIdx.x + blockIdx.x * blockDim.x;
    int32_t ymax = signWrite ? p.sShape.y : p.xShape.y;
    int32_t qmax = p.xShape.z * p.xShape.w; // Combined minibatch*channel maximum index.

    // Loop to accommodate oversized tensors.
    for (int32_t q = blockIdx.z; q < qmax; q += gridDim.z)
    for (int32_t y = blockIdx.y; y < ymax; y += gridDim.y)
    {
        // Extract z and w (channel, minibatch index).
        int32_t w = q / p.xShape.z;
        int32_t z = q - w * p.xShape.z;

        // Choose behavior based on sign read/write mode.
        if (signWrite)
        {
            // Process value if in p.x.
            uint32_t s = 0;
            if (x < p.xShape.x && y < p.xShape.y)
            {
                int64_t ix = x * p.xStride.x + y * p.xStride.y + z * p.xStride.z + w * p.xStride.w;
                T* pv = ((T*)p.x) + ix;
                scalar_t v = (scalar_t)(*pv);

                // Gain, LReLU, clamp.
                v *= p.gain;
                if (v < 0.f)
                {
                    v *= p.slope;
                    s = 1; // Sign.
                }
                if (fabsf(v) > p.clamp)
                {
                    v = InternalType<T>::clamp(v, p.clamp);
                    s = 2; // Clamp.
                }

                *pv = (T)v; // Write value.
            }

            // Coalesce into threads 0 and 16 of warp.
            uint32_t m = (threadIdx.x & 16) ? 0xffff0000u : 0x0000ffffu;
            s <<= ((threadIdx.x & 15) << 1); // Shift into place.
            s |= __shfl_xor_sync(m, s, 1); // Distribute.
            s |= __shfl_xor_sync(m, s, 2);
            s |= __shfl_x

Download .txt

gitextract_wzfbvs4n/

├── .idea/
│   ├── CCSR.iml
│   ├── inspectionProfiles/
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── ADD/
│   ├── dnnlib/
│   │   ├── __init__.py
│   │   └── util.py
│   ├── layers/
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── block.py
│   │   ├── dino_head.py
│   │   ├── drop_path.py
│   │   ├── layer_scale.py
│   │   ├── mlp.py
│   │   ├── patch_embed.py
│   │   └── swiglu_ffn.py
│   ├── models/
│   │   ├── discriminator.py
│   │   └── vit.py
│   ├── th_utils/
│   │   ├── __init__.py
│   │   ├── custom_ops.py
│   │   ├── misc.py
│   │   └── ops/
│   │       ├── __init__.py
│   │       ├── bias_act.cpp
│   │       ├── bias_act.cu
│   │       ├── bias_act.h
│   │       ├── bias_act.py
│   │       ├── conv2d_gradfix.py
│   │       ├── conv2d_resample.py
│   │       ├── filtered_lrelu.cpp
│   │       ├── filtered_lrelu.cu
│   │       ├── filtered_lrelu.h
│   │       ├── filtered_lrelu.py
│   │       ├── filtered_lrelu_ns.cu
│   │       ├── filtered_lrelu_rd.cu
│   │       ├── filtered_lrelu_wr.cu
│   │       ├── fma.py
│   │       ├── grid_sample_gradfix.py
│   │       ├── upfirdn2d.cpp
│   │       ├── upfirdn2d.cu
│   │       ├── upfirdn2d.h
│   │       └── upfirdn2d.py
│   └── utils/
│       └── util_net.py
├── LICENSE
├── README.md
├── dataloaders/
│   ├── paired_dataset_txt.py
│   ├── params_ccsr.yml
│   └── realesrgan.py
├── models/
│   ├── DiffAugment.py
│   ├── controlnet.py
│   ├── losses/
│   │   ├── __init__.py
│   │   ├── contperceptual.py
│   │   └── vqperceptual.py
│   ├── shared.py
│   ├── unet_2d_blocks.py
│   ├── unet_2d_condition.py
│   └── vit_utils.py
├── myutils/
│   ├── devices.py
│   ├── img_util.py
│   ├── misc.py
│   ├── vaehook.py
│   └── wavelet_color_fix.py
├── pipelines/
│   └── pipeline_ccsr.py
├── requirements.txt
├── scripts/
│   ├── get_path.py
│   ├── test/
│   │   ├── test_ccsr_multistep.sh
│   │   ├── test_ccsr_onestep.sh
│   │   └── test_ccsr_tile.sh
│   └── train/
│       ├── train_ccsr_stage1.sh
│       ├── train_ccsr_stage2.sh
│       └── train_controlnet.sh
├── test_ccsr_tile.py
├── train_ccsr_stage1.py
├── train_ccsr_stage2.py
├── train_controlnet.py
└── utils/
    ├── devices.py
    ├── img_util.py
    ├── misc.py
    ├── vaehook.py
    └── wavelet_color_fix.py

Download .txt

SYMBOL INDEX (535 symbols across 53 files)

FILE: ADD/dnnlib/util.py
  class EasyDict (line 39) | class EasyDict(dict):
    method __getattr__ (line 42) | def __getattr__(self, name: str) -> Any:
    method __setattr__ (line 48) | def __setattr__(self, name: str, value: Any) -> None:
    method __delattr__ (line 51) | def __delattr__(self, name: str) -> None:
  class Logger (line 55) | class Logger(object):
    method __init__ (line 58) | def __init__(self, file_name: Optional[str] = None, file_mode: str = "...
    method __enter__ (line 71) | def __enter__(self) -> "Logger":
    method __exit__ (line 74) | def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> N...
    method write (line 77) | def write(self, text: Union[str, bytes]) -> None:
    method flush (line 92) | def flush(self) -> None:
    method close (line 99) | def close(self) -> None:
  function set_cache_dir (line 119) | def set_cache_dir(path: str) -> None:
  function make_cache_dir_path (line 124) | def make_cache_dir_path(*paths: str) -> str:
  function format_time (line 139) | def format_time(seconds: Union[int, float]) -> str:
  function format_time_brief (line 153) | def format_time_brief(seconds: Union[int, float]) -> str:
  function ask_yes_no (line 167) | def ask_yes_no(question: str) -> bool:
  function tuple_product (line 177) | def tuple_product(t: Tuple) -> Any:
  function get_dtype_and_ctype (line 201) | def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]:
  function is_pickleable (line 224) | def is_pickleable(obj: Any) -> bool:
  function get_module_from_obj_name (line 236) | def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, s...
  function get_obj_from_module (line 277) | def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
  function get_obj_by_name (line 287) | def get_obj_by_name(name: str) -> Any:
  function call_func_by_name (line 293) | def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any:
  function construct_class_by_name (line 301) | def construct_class_by_name(*args, class_name: str = None, **kwargs) -> ...
  function get_module_dir_by_obj_name (line 306) | def get_module_dir_by_obj_name(obj_name: str) -> str:
  function is_top_level_function (line 312) | def is_top_level_function(obj: Any) -> bool:
  function get_top_level_function_name (line 317) | def get_top_level_function_name(obj: Any) -> str:
  function list_dir_recursively_with_ignore (line 329) | def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] =...
  function copy_files_and_create_dirs (line 362) | def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
  function is_url (line 378) | def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
  function open_url (line 396) | def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, ve...

FILE: ADD/layers/attention.py
  class Attention (line 36) | class Attention(nn.Module):
    method __init__ (line 37) | def __init__(
    method forward (line 56) | def forward(self, x: Tensor) -> Tensor:
  class MemEffAttention (line 72) | class MemEffAttention(Attention):
    method forward (line 73) | def forward(self, x: Tensor, attn_bias=None) -> Tensor:

FILE: ADD/layers/block.py
  class Block (line 43) | class Block(nn.Module):
    method __init__ (line 44) | def __init__(
    method forward (line 89) | def forward(self, x: Tensor) -> Tensor:
  function drop_add_residual_stochastic_depth (line 117) | def drop_add_residual_stochastic_depth(
  function get_branges_scales (line 141) | def get_branges_scales(x, sample_drop_ratio=0.0):
  function add_residual (line 149) | def add_residual(x, brange, residual, residual_scale_factor, scaling_vec...
  function get_attn_bias_and_cat (line 164) | def get_attn_bias_and_cat(x_list, branges=None):
  function drop_add_residual_stochastic_depth_list (line 188) | def drop_add_residual_stochastic_depth_list(
  class NestedTensorBlock (line 211) | class NestedTensorBlock(Block):
    method forward_nested (line 212) | def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
    method forward (line 252) | def forward(self, x_or_x_list):

FILE: ADD/layers/dino_head.py
  class DINOHead (line 12) | class DINOHead(nn.Module):
    method __init__ (line 13) | def __init__(
    method _init_weights (line 30) | def _init_weights(self, m):
    method forward (line 36) | def forward(self, x):
  function _build_mlp (line 44) | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=...

FILE: ADD/layers/drop_path.py
  function drop_path (line 14) | def drop_path(x, drop_prob: float = 0.0, training: bool = False):
  class DropPath (line 26) | class DropPath(nn.Module):
    method __init__ (line 29) | def __init__(self, drop_prob=None):
    method forward (line 33) | def forward(self, x):

FILE: ADD/layers/layer_scale.py
  class LayerScale (line 15) | class LayerScale(nn.Module):
    method __init__ (line 16) | def __init__(
    method forward (line 26) | def forward(self, x: Tensor) -> Tensor:

FILE: ADD/layers/mlp.py
  class Mlp (line 16) | class Mlp(nn.Module):
    method __init__ (line 17) | def __init__(
    method forward (line 34) | def forward(self, x: Tensor) -> Tensor:

FILE: ADD/layers/patch_embed.py
  function make_2tuple (line 16) | def make_2tuple(x):
  class PatchEmbed (line 25) | class PatchEmbed(nn.Module):
    method __init__ (line 37) | def __init__(
    method forward (line 68) | def forward(self, x: Tensor) -> Tensor:

FILE: ADD/layers/swiglu_ffn.py
  class SwiGLUFFN (line 14) | class SwiGLUFFN(nn.Module):
    method __init__ (line 15) | def __init__(
    method forward (line 30) | def forward(self, x: Tensor) -> Tensor:
  class SwiGLUFFNFused (line 54) | class SwiGLUFFNFused(SwiGLU):
    method __init__ (line 55) | def __init__(

FILE: ADD/models/discriminator.py
  class SpectralConv1d (line 31) | class SpectralConv1d(nn.Conv1d):
    method __init__ (line 32) | def __init__(self, *args, **kwargs):
  class BatchNormLocal (line 37) | class BatchNormLocal(nn.Module):
    method __init__ (line 38) | def __init__(self, num_features: int, affine: bool = True, virtual_bs:...
    method forward (line 48) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  function make_block (line 66) | def make_block(channels: int, kernel_size: int) -> nn.Module:
  class DiscHead (line 80) | class DiscHead(nn.Module):
    method __init__ (line 81) | def __init__(self, channels: int, c_dim: int, cmap_dim: int = 64):
    method forward (line 98) | def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
  class DINO (line 108) | class DINO(torch.nn.Module):
    method __init__ (line 109) | def __init__(self, hooks: list[int] = [2,5,8,11], hook_patch: bool = T...
    method forward (line 125) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class ProjectedDiscriminator (line 133) | class ProjectedDiscriminator(nn.Module):
    method __init__ (line 134) | def __init__(self, c_dim: int, diffaug: bool = True, p_crop: float = 0...
    method train (line 147) | def train(self, mode: bool = True):
    method eval (line 152) | def eval(self):
    method forward (line 155) | def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:

FILE: ADD/models/vit.py
  function named_apply (line 26) | def named_apply(fn: Callable, module: nn.Module, name="", depth_first=Tr...
  class BlockChunk (line 37) | class BlockChunk(nn.ModuleList):
    method forward (line 38) | def forward(self, x):
  class DinoVisionTransformer (line 44) | class DinoVisionTransformer(nn.Module):
    method __init__ (line 45) | def __init__(
    method init_weights (line 172) | def init_weights(self):
    method interpolate_pos_encoding (line 179) | def interpolate_pos_encoding(self, x, w, h):
    method prepare_tokens_with_masks (line 209) | def prepare_tokens_with_masks(self, x, masks=None):
    method forward_features_list (line 230) | def forward_features_list(self, x_list, masks_list):
    method forward_features (line 250) | def forward_features(self, x, masks=None):
    method _get_intermediate_layers_not_chunked (line 268) | def _get_intermediate_layers_not_chunked(self, x, n=1):
    method _get_intermediate_layers_chunked (line 280) | def _get_intermediate_layers_chunked(self, x, n=1):
    method get_intermediate_layers (line 294) | def get_intermediate_layers(
    method forward (line 320) | def forward(self, *args, is_training=False, **kwargs):
  function init_weights_vit_timm (line 328) | def init_weights_vit_timm(module: nn.Module, name: str = ""):
  function vit_small (line 336) | def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
  function vit_large (line 349) | def vit_large(patch_size=16, num_register_tokens=0, **kwargs):

FILE: ADD/th_utils/custom_ops.py
  function _find_compiler_bindir (line 29) | def _find_compiler_bindir():
  function _get_mangled_gpu_name (line 44) | def _get_mangled_gpu_name():
  function get_plugin (line 59) | def get_plugin(module_name, sources, headers=None, source_dir=None, **bu...

FILE: ADD/th_utils/misc.py
  function constant (line 22) | def constant(value, shape=None, dtype=None, device=None, memory_format=N...
  function nan_to_num (line 49) | def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): #...
  function suppress_tracer_warnings (line 71) | def suppress_tracer_warnings():
  function assert_shape (line 82) | def assert_shape(tensor, ref_shape):
  function profiled_function (line 100) | def profiled_function(fn):
  class InfiniteSampler (line 111) | class InfiniteSampler(torch.utils.data.Sampler):
    method __init__ (line 112) | def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed...
    method __iter__ (line 125) | def __iter__(self):
  function spectral_to_cpu (line 146) | def spectral_to_cpu(model: torch.nn.Module):
  function get_children (line 154) | def get_children(model: torch.nn.Module):
  function params_and_buffers (line 167) | def params_and_buffers(module):
  function named_params_and_buffers (line 171) | def named_params_and_buffers(module):
  function copy_params_and_buffers (line 175) | def copy_params_and_buffers(src_module, dst_module, require_all=False):
  function ddp_sync (line 189) | def ddp_sync(module, sync):
  function check_ddp_consistency (line 200) | def check_ddp_consistency(module, ignore_regex=None):
  function print_module_summary (line 216) | def print_module_summary(module, inputs, max_nesting=3, skip_redundant=T...

FILE: ADD/th_utils/ops/bias_act.cpp
  function has_same_layout (line 16) | static bool has_same_layout(torch::Tensor x, torch::Tensor y)
  function bias_act (line 32) | static torch::Tensor bias_act(torch::Tensor x, torch::Tensor b, torch::T...
  function PYBIND11_MODULE (line 94) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: ADD/th_utils/ops/bias_act.h
  type bias_act_kernel_params (line 12) | struct bias_act_kernel_params

FILE: ADD/th_utils/ops/bias_act.py
  function _init (line 38) | def _init():
  function bias_act (line 52) | def bias_act(x, b=None, dim=1, act='linear', alpha=None, gain=None, clam...
  function _bias_act_ref (line 91) | def _bias_act_ref(x, b=None, dim=1, act='linear', alpha=None, gain=None,...
  function _bias_act_cuda (line 126) | def _bias_act_cuda(dim=1, act='linear', alpha=None, gain=None, clamp=None):

FILE: ADD/th_utils/ops/conv2d_gradfix.py
  function no_weight_gradients (line 27) | def no_weight_gradients(disable=True):
  function conv2d (line 37) | def conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, gr...
  function conv_transpose2d (line 42) | def conv_transpose2d(input, weight, bias=None, stride=1, padding=0, outp...
  function _should_use_custom_op (line 49) | def _should_use_custom_op(input):
  function _tuple_of_ints (line 60) | def _tuple_of_ints(xs, ndim):
  function _conv2d_gradfix (line 71) | def _conv2d_gradfix(transpose, weight_shape, stride, padding, output_pad...

FILE: ADD/th_utils/ops/conv2d_resample.py
  function _get_weight_shape (line 21) | def _get_weight_shape(w):
  function _conv2d_wrapper (line 29) | def _conv2d_wrapper(x, w, stride=1, padding=0, groups=1, transpose=False...
  function conv2d_resample (line 46) | def conv2d_resample(x, w, f=None, up=1, down=1, padding=0, groups=1, fli...

FILE: ADD/th_utils/ops/filtered_lrelu.cpp
  function filtered_lrelu (line 16) | static std::tuple<torch::Tensor, torch::Tensor, int> filtered_lrelu(
  function filtered_lrelu_act (line 213) | static torch::Tensor filtered_lrelu_act(torch::Tensor x, torch::Tensor s...
  function PYBIND11_MODULE (line 294) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: ADD/th_utils/ops/filtered_lrelu.h
  type filtered_lrelu_kernel_params (line 14) | struct filtered_lrelu_kernel_params
  type filtered_lrelu_act_kernel_params (line 55) | struct filtered_lrelu_act_kernel_params
  type filtered_lrelu_kernel_spec (line 73) | struct filtered_lrelu_kernel_spec

FILE: ADD/th_utils/ops/filtered_lrelu.py
  function _init (line 23) | def _init():
  function _get_filter_size (line 35) | def _get_filter_size(f):
  function _parse_padding (line 42) | def _parse_padding(padding):
  function filtered_lrelu (line 56) | def filtered_lrelu(x, fu=None, fd=None, b=None, up=1, down=1, padding=0,...
  function _filtered_lrelu_ref (line 121) | def _filtered_lrelu_ref(x, fu=None, fd=None, b=None, up=1, down=1, paddi...
  function _filtered_lrelu_cuda (line 159) | def _filtered_lrelu_cuda(up=1, down=1, padding=0, gain=np.sqrt(2), slope...

FILE: ADD/th_utils/ops/fma.py
  function fma (line 15) | def fma(a, b, c): # => a * b + c
  class _FusedMultiplyAdd (line 20) | class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c
    method forward (line 22) | def forward(ctx, a, b, c): # pylint: disable=arguments-differ
    method backward (line 29) | def backward(ctx, dout): # pylint: disable=arguments-differ
  function _unbroadcast (line 49) | def _unbroadcast(x, shape):

FILE: ADD/th_utils/ops/grid_sample_gradfix.py
  function grid_sample (line 28) | def grid_sample(input, grid):
  function _should_use_custom_op (line 35) | def _should_use_custom_op():
  class _GridSample2dForward (line 40) | class _GridSample2dForward(torch.autograd.Function):
    method forward (line 42) | def forward(ctx, input, grid):
    method backward (line 50) | def backward(ctx, grad_output):
  class _GridSample2dBackward (line 57) | class _GridSample2dBackward(torch.autograd.Function):
    method forward (line 59) | def forward(ctx, grad_output, input, grid):
    method backward (line 70) | def backward(ctx, grad2_grad_input, grad2_grad_grid):

FILE: ADD/th_utils/ops/upfirdn2d.cpp
  function upfirdn2d (line 16) | static torch::Tensor upfirdn2d(torch::Tensor x, torch::Tensor f, int upx...
  function PYBIND11_MODULE (line 102) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)

FILE: ADD/th_utils/ops/upfirdn2d.h
  type upfirdn2d_kernel_params (line 14) | struct upfirdn2d_kernel_params
  type upfirdn2d_kernel_spec (line 45) | struct upfirdn2d_kernel_spec

FILE: ADD/th_utils/ops/upfirdn2d.py
  function _init (line 23) | def _init():
  function _parse_scaling (line 35) | def _parse_scaling(scaling):
  function _parse_padding (line 44) | def _parse_padding(padding):
  function _get_filter_size (line 55) | def _get_filter_size(f):
  function setup_filter (line 70) | def setup_filter(f, device=torch.device('cpu'), normalize=True, flip_fil...
  function upfirdn2d (line 118) | def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, ...
  function _upfirdn2d_ref (line 167) | def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gai...
  function _upfirdn2d_cuda (line 217) | def _upfirdn2d_cuda(up=1, down=1, padding=0, flip_filter=False, gain=1):
  function filter2d (line 277) | def filter2d(x, f, padding=0, flip_filter=False, gain=1, impl='cuda'):
  function upsample2d (line 313) | def upsample2d(x, f, up=2, padding=0, flip_filter=False, gain=1, impl='c...
  function downsample2d (line 352) | def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, imp...

FILE: ADD/utils/util_net.py
  function calculate_parameters (line 12) | def calculate_parameters(net):
  function pad_input (line 18) | def pad_input(x, mod):
  function forward_chop (line 25) | def forward_chop(net, x, net_kwargs=None, scale=1, shave=10, min_size=16...
  function measure_time (line 68) | def measure_time(net, inputs, num_forward=100):
  function reload_model (line 86) | def reload_model(model, ckpt):
  function compute_hinge_loss (line 99) | def compute_hinge_loss(real_output, fake_output, x_start_, r1_lambda):
  function reload_model_ (line 125) | def reload_model_(model, ckpt):
  function reload_model_IDE (line 140) | def reload_model_IDE(model, ckpt):
  class EMA (line 151) | class EMA():
    method __init__ (line 152) | def __init__(self, model, decay):
    method register (line 158) | def register(self):
    method update (line 163) | def update(self):
    method apply_shadow (line 170) | def apply_shadow(self):
    method restore (line 177) | def restore(self):

FILE: dataloaders/paired_dataset_txt.py
  class PairedCaptionDataset (line 14) | class PairedCaptionDataset(data.Dataset):
    method __init__ (line 15) | def __init__(
    method tokenize_caption (line 37) | def tokenize_caption(self, caption=""):
    method __getitem__ (line 44) | def __getitem__(self, index):
    method __len__ (line 69) | def __len__(self):

FILE: dataloaders/realesrgan.py
  function ordered_yaml (line 23) | def ordered_yaml():
  function opt_parse (line 47) | def opt_parse(opt_path):
  class RealESRGAN_degradation (line 54) | class RealESRGAN_degradation(object):
    method __init__ (line 55) | def __init__(self, opt_path='', device='cpu'):
    method color_jitter_pt (line 88) | def color_jitter_pt(self, img, brightness, contrast, saturation, hue):
    method random_augment (line 108) | def random_augment(self, img_gt):
    method random_kernels (line 129) | def random_kernels(self):
    method degrade_process (line 191) | def degrade_process(self, img_gt, resize_bak=False):

FILE: models/DiffAugment.py
  function DiffAugment (line 35) | def DiffAugment(x: torch.Tensor, policy: str = '', channels_first: bool ...
  function rand_brightness (line 48) | def rand_brightness(x: torch.Tensor) -> torch.Tensor:
  function rand_saturation (line 53) | def rand_saturation(x: torch.Tensor) -> torch.Tensor:
  function rand_contrast (line 59) | def rand_contrast(x: torch.Tensor) -> torch.Tensor:
  function rand_translation (line 65) | def rand_translation(x: torch.Tensor, ratio: float = 0.125) -> torch.Ten...
  function rand_cutout (line 81) | def rand_cutout(x: torch.Tensor, ratio: float = 0.2) -> torch.Tensor:
  function rand_resize (line 98) | def rand_resize(x: torch.Tensor, min_ratio: float = 0.8, max_ratio: floa...

FILE: models/controlnet.py
  class ControlNetOutput (line 40) | class ControlNetOutput(BaseOutput):
  class ControlNetConditioningEmbedding (line 59) | class ControlNetConditioningEmbedding(nn.Module):
    method __init__ (line 69) | def __init__(
    method forward (line 91) | def forward(self, conditioning):
  class ControlNetModel (line 104) | class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMix...
    method __init__ (line 173) | def __init__(
    method from_unet (line 434) | def from_unet(
    method attn_processors (line 507) | def attn_processors(self) -> Dict[str, AttentionProcessor]:
    method set_attn_processor (line 531) | def set_attn_processor(self, processor: Union[AttentionProcessor, Dict...
    method set_default_attn_processor (line 566) | def set_default_attn_processor(self):
    method set_attention_slice (line 573) | def set_attention_slice(self, slice_size):
    method _set_gradient_checkpointing (line 638) | def _set_gradient_checkpointing(self, module, value=False):
    method forward (line 642) | def forward(
  function zero_module (line 845) | def zero_module(module):

FILE: models/losses/contperceptual.py
  class LPIPSWithDiscriminator (line 9) | class LPIPSWithDiscriminator(ModelMixin, ConfigMixin, FromOriginalContro...
    method __init__ (line 10) | def __init__(self, disc_start, logvar_init=0.0, kl_weight=1.0, pixello...
    method calculate_adaptive_weight (line 34) | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
    method forward (line 47) | def forward(self, inputs, reconstructions, optimizer_idx,

FILE: models/losses/vqperceptual.py
  function hinge_d_loss_with_exemplar_weights (line 11) | def hinge_d_loss_with_exemplar_weights(logits_real, logits_fake, weights):
  function adopt_weight (line 20) | def adopt_weight(weight, global_step, threshold=0, value=0.):
  function measure_perplexity (line 26) | def measure_perplexity(predicted_indices, n_embed):
  function l1 (line 35) | def l1(x, y):
  function l2 (line 39) | def l2(x, y):
  class VQLPIPSWithDiscriminator (line 43) | class VQLPIPSWithDiscriminator(nn.Module):
    method __init__ (line 44) | def __init__(self, disc_start, codebook_weight=1.0, pixelloss_weight=1.0,
    method calculate_adaptive_weight (line 98) | def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None):
    method forward (line 111) | def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx,

FILE: models/shared.py
  class ResidualBlock (line 20) | class ResidualBlock(nn.Module):
    method __init__ (line 21) | def __init__(self, fn: Callable):
    method forward (line 25) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class FullyConnectedLayer (line 29) | class FullyConnectedLayer(nn.Module):
    method __init__ (line 30) | def __init__(
    method forward (line 51) | def forward(self, x: torch.Tensor) -> torch.Tensor:
    method extra_repr (line 66) | def extra_repr(self) -> str:
  class MLP (line 70) | class MLP(nn.Module):
    method __init__ (line 71) | def __init__(
    method forward (line 91) | def forward(self, x: torch.Tensor) -> torch.Tensor:

FILE: models/unet_2d_blocks.py
  function get_down_block (line 33) | def get_down_block(
  function get_up_block (line 230) | def get_up_block(
  class AutoencoderTinyBlock (line 431) | class AutoencoderTinyBlock(nn.Module):
    method __init__ (line 432) | def __init__(self, in_channels: int, out_channels: int, act_fn: str):
    method forward (line 449) | def forward(self, x):
  class UNetMidBlock2D (line 453) | class UNetMidBlock2D(nn.Module):
    method __init__ (line 454) | def __init__(
    method forward (line 534) | def forward(self, hidden_states, temb=None):
  class UNetMidBlock2DCrossAttn (line 544) | class UNetMidBlock2DCrossAttn(nn.Module):
    method __init__ (line 545) | def __init__(
    method forward (line 636) | def forward(
  class UNetMidBlock2DSimpleCrossAttn (line 689) | class UNetMidBlock2DSimpleCrossAttn(nn.Module):
    method __init__ (line 690) | def __init__(
    method forward (line 774) | def forward(
  class AttnDownBlock2D (line 812) | class AttnDownBlock2D(nn.Module):
    method __init__ (line 813) | def __init__(
    method forward (line 904) | def forward(self, hidden_states, temb=None, upsample_size=None):
  class CrossAttnDownBlock2D (line 924) | class CrossAttnDownBlock2D(nn.Module):
    method __init__ (line 925) | def __init__(
    method forward (line 1017) | def forward(
  class DownBlock2D (line 1084) | class DownBlock2D(nn.Module):
    method __init__ (line 1085) | def __init__(
    method forward (line 1136) | def forward(self, hidden_states, temb=None):
  class DownEncoderBlock2D (line 1170) | class DownEncoderBlock2D(nn.Module):
    method __init__ (line 1171) | def __init__(
    method forward (line 1219) | def forward(self, hidden_states):
  class AttnDownEncoderBlock2D (line 1230) | class AttnDownEncoderBlock2D(nn.Module):
    method __init__ (line 1231) | def __init__(
    method forward (line 1302) | def forward(self, hidden_states):
  class AttnSkipDownBlock2D (line 1314) | class AttnSkipDownBlock2D(nn.Module):
    method __init__ (line 1315) | def __init__(
    method forward (line 1395) | def forward(self, hidden_states, temb=None, skip_sample=None):
  class SkipDownBlock2D (line 1415) | class SkipDownBlock2D(nn.Module):
    method __init__ (line 1416) | def __init__(
    method forward (line 1475) | def forward(self, hidden_states, temb=None, skip_sample=None):
  class ResnetDownsampleBlock2D (line 1494) | class ResnetDownsampleBlock2D(nn.Module):
    method __init__ (line 1495) | def __init__(
    method forward (line 1558) | def forward(self, hidden_states, temb=None):
  class SimpleCrossAttnDownBlock2D (line 1592) | class SimpleCrossAttnDownBlock2D(nn.Module):
    method __init__ (line 1593) | def __init__(
    method forward (line 1687) | def forward(
  class KDownBlock2D (line 1750) | class KDownBlock2D(nn.Module):
    method __init__ (line 1751) | def __init__(
    method forward (line 1796) | def forward(self, hidden_states, temb=None):
  class KCrossAttnDownBlock2D (line 1828) | class KCrossAttnDownBlock2D(nn.Module):
    method __init__ (line 1829) | def __init__(
    method forward (line 1893) | def forward(
  class AttnUpBlock2D (line 1954) | class AttnUpBlock2D(nn.Module):
    method __init__ (line 1955) | def __init__(
    method forward (line 2043) | def forward(self, hidden_states, res_hidden_states_tuple, temb=None, u...
  class CrossAttnUpBlock2D (line 2063) | class CrossAttnUpBlock2D(nn.Module):
    method __init__ (line 2064) | def __init__(
    method forward (line 2152) | def forward(
  class UpBlock2D (line 2215) | class UpBlock2D(nn.Module):
    method __init__ (line 2216) | def __init__(
    method forward (line 2263) | def forward(self, hidden_states, res_hidden_states_tuple, temb=None, u...
  class UpDecoderBlock2D (line 2296) | class UpDecoderBlock2D(nn.Module):
    method __init__ (line 2297) | def __init__(
    method forward (line 2340) | def forward(self, hidden_states, temb=None):
  class AttnUpDecoderBlock2D (line 2351) | class AttnUpDecoderBlock2D(nn.Module):
    method __init__ (line 2352) | def __init__(
    method forward (line 2419) | def forward(self, hidden_states, temb=None):
  class AttnSkipUpBlock2D (line 2431) | class AttnSkipUpBlock2D(nn.Module):
    method __init__ (line 2432) | def __init__(
    method forward (line 2522) | def forward(self, hidden_states, res_hidden_states_tuple, temb=None, s...
  class SkipUpBlock2D (line 2550) | class SkipUpBlock2D(nn.Module):
    method __init__ (line 2551) | def __init__(
    method forward (line 2619) | def forward(self, hidden_states, res_hidden_states_tuple, temb=None, s...
  class ResnetUpsampleBlock2D (line 2645) | class ResnetUpsampleBlock2D(nn.Module):
    method __init__ (line 2646) | def __init__(
    method forward (line 2712) | def forward(self, hidden_states, res_hidden_states_tuple, temb=None, u...
  class SimpleCrossAttnUpBlock2D (line 2745) | class SimpleCrossAttnUpBlock2D(nn.Module):
    method __init__ (line 2746) | def __init__(
    method forward (line 2842) | def forward(
  class KUpBlock2D (line 2908) | class KUpBlock2D(nn.Module):
    method __init__ (line 2909) | def __init__(
    method forward (line 2956) | def forward(self, hidden_states, res_hidden_states_tuple, temb=None, u...
  class KCrossAttnUpBlock2D (line 2988) | class KCrossAttnUpBlock2D(nn.Module):
    method __init__ (line 2989) | def __init__(
    method forward (line 3072) | def forward(
  class KAttentionBlock (line 3133) | class KAttentionBlock(nn.Module):
    method __init__ (line 3150) | def __init__(
    method _to_3d (line 3193) | def _to_3d(self, hidden_states, height, weight):
    method _to_4d (line 3196) | def _to_4d(self, hidden_states, height, weight):
    method forward (line 3199) | def forward(

FILE: models/unet_2d_condition.py
  class UNet2DConditionOutput (line 53) | class UNet2DConditionOutput(BaseOutput):
  class UNet2DConditionModel (line 65) | class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoade...
    method __init__ (line 156) | def __init__(
    method attn_processors (line 579) | def attn_processors(self) -> Dict[str, AttentionProcessor]:
    method set_attn_processor (line 602) | def set_attn_processor(self, processor: Union[AttentionProcessor, Dict...
    method set_default_attn_processor (line 636) | def set_default_attn_processor(self):
    method set_attention_slice (line 642) | def set_attention_slice(self, slice_size):
    method _set_gradient_checkpointing (line 707) | def _set_gradient_checkpointing(self, module, value=False):
    method forward (line 711) | def forward(
    method from_pretrained_orig (line 1025) | def from_pretrained_orig(cls, pretrained_model_path, subfolder=None, *...
    method from_pretrained_safetensor (line 1059) | def from_pretrained_safetensor(cls, pretrained_model_path, subfolder=N...

FILE: models/vit_utils.py
  class AddReadout (line 36) | class AddReadout(nn.Module):
    method __init__ (line 37) | def __init__(self, start_index: bool = 1):
    method forward (line 41) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class Transpose (line 49) | class Transpose(nn.Module):
    method __init__ (line 50) | def __init__(self, dim0: int, dim1: int):
    method forward (line 55) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  function forward_vit (line 60) | def forward_vit(pretrained: nn.Module, x: torch.Tensor) -> dict:
  function _resize_pos_embed (line 66) | def _resize_pos_embed(self, posemb: torch.Tensor, gs_h: int, gs_w: int) ...
  function forward_flex (line 83) | def forward_flex(self, x: torch.Tensor) -> torch.Tensor:
  function get_activation (line 111) | def get_activation(name: str) -> Callable:
  function make_sd_backbone (line 117) | def make_sd_backbone(
  function make_vit_backbone (line 150) | def make_vit_backbone(

FILE: myutils/devices.py
  function has_mps (line 12) | def has_mps() -> bool:
  function get_cuda_device_string (line 19) | def get_cuda_device_string():
  function get_optimal_device_name (line 23) | def get_optimal_device_name():
  function get_optimal_device (line 33) | def get_optimal_device():
  function get_device_for (line 37) | def get_device_for(task):
  function torch_gc (line 41) | def torch_gc():
  function enable_tf32 (line 52) | def enable_tf32():
  function cond_cast_unet (line 75) | def cond_cast_unet(input):
  function cond_cast_float (line 79) | def cond_cast_float(input):
  function randn (line 83) | def randn(seed, shape):
  function randn_without_seed (line 88) | def randn_without_seed(shape):
  function autocast (line 92) | def autocast(disable=False):
  function without_autocast (line 99) | def without_autocast(disable=False):
  class NansException (line 103) | class NansException(Exception):
  function test_for_nans (line 107) | def test_for_nans(x, where):
  function first_time_calculation (line 126) | def first_time_calculation():

FILE: myutils/img_util.py
  function save_videos_grid (line 12) | def save_videos_grid(videos, path=None, rescale=True, n_rows=4, fps=8, d...
  function convert_image_to_fn (line 32) | def convert_image_to_fn(img_type, minsize, image, eps=0.02):

FILE: myutils/misc.py
  function rand_name (line 9) | def rand_name(length=8, suffix=''):
  function cycle (line 17) | def cycle(dl):
  function exists (line 22) | def exists(x):
  function identity (line 25) | def identity(x):
  function load_dreambooth_lora (line 28) | def load_dreambooth_lora(unet, vae=None, model_path=None, alpha=1.0, mod...

FILE: myutils/vaehook.py
  function get_recommend_encoder_tile_size (line 83) | def get_recommend_encoder_tile_size():
  function get_recommend_decoder_tile_size (line 100) | def get_recommend_decoder_tile_size():
  function inplace_nonlinearity (line 130) | def inplace_nonlinearity(x):
  function attn_forward_new (line 137) | def attn_forward_new(self, h_):
  function attn_forward (line 173) | def attn_forward(self, h_):
  function xformer_attn_forward (line 199) | def xformer_attn_forward(self, h_):
  function attn2task (line 230) | def attn2task(task_queue, net):
  function resblock2task (line 248) | def resblock2task(queue, block):
  function build_sampling (line 279) | def build_sampling(task_queue, net, is_decoder):
  function build_task_queue (line 336) | def build_task_queue(net, is_decoder):
  function clone_task_queue (line 366) | def clone_task_queue(task_queue):
  function get_var_mean (line 375) | def get_var_mean(input, num_groups, eps=1e-6):
  function custom_group_norm (line 388) | def custom_group_norm(input, num_groups, mean, var, weight=None, bias=No...
  function crop_valid_region (line 420) | def crop_valid_region(x, input_bbox, target_bbox, is_decoder):
  function perfcount (line 436) | def perfcount(fn):
  class GroupNormParam (line 463) | class GroupNormParam:
    method __init__ (line 464) | def __init__(self):
    method add_tile (line 471) | def add_tile(self, tile, layer):
    method summary (line 493) | def summary(self):
    method from_tile (line 515) | def from_tile(tile, norm):
  class VAEHook (line 541) | class VAEHook:
    method __init__ (line 542) | def __init__(self, net, tile_size, is_decoder, fast_decoder, fast_enco...
    method __call__ (line 552) | def __call__(self, x):
    method get_best_tile_size (line 566) | def get_best_tile_size(self, lowerbound, upperbound):
    method split_tiles (line 581) | def split_tiles(self, h, w):
    method estimate_group_norm (line 641) | def estimate_group_norm(self, z, task_queue, color_fix):
    method vae_tile_forward (line 685) | def vae_tile_forward(self, z):

FILE: myutils/wavelet_color_fix.py
  function adain_color_fix (line 14) | def adain_color_fix(target: Image, source: Image):
  function wavelet_color_fix (line 29) | def wavelet_color_fix(target: Image, source: Image):
  function calc_mean_std (line 44) | def calc_mean_std(feat: Tensor, eps=1e-5):
  function adaptive_instance_normalization (line 59) | def adaptive_instance_normalization(content_feat:Tensor, style_feat:Tens...
  function wavelet_blur (line 73) | def wavelet_blur(image: Tensor, radius: int):
  function wavelet_decomposition (line 94) | def wavelet_decomposition(image: Tensor, levels=5):
  function wavelet_reconstruction (line 108) | def wavelet_reconstruction(content_feat:Tensor, style_feat:Tensor):

FILE: pipelines/pipeline_ccsr.py
  class StableDiffusionControlNetPipeline (line 104) | class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInvers...
    method __init__ (line 140) | def __init__(
    method _init_tiled_vae (line 188) | def _init_tiled_vae(self,
    method enable_vae_slicing (line 211) | def enable_vae_slicing(self):
    method disable_vae_slicing (line 221) | def disable_vae_slicing(self):
    method enable_vae_tiling (line 229) | def enable_vae_tiling(self):
    method disable_vae_tiling (line 239) | def disable_vae_tiling(self):
    method enable_sequential_cpu_offload (line 246) | def enable_sequential_cpu_offload(self, gpu_id=0):
    method enable_model_cpu_offload (line 267) | def enable_model_cpu_offload(self, gpu_id=0):
    method _execution_device (line 297) | def _execution_device(self):
    method _encode_prompt (line 315) | def _encode_prompt(
    method run_safety_checker (line 462) | def run_safety_checker(self, image, device, dtype):
    method decode_latents (line 477) | def decode_latents(self, latents):
    method prepare_extra_step_kwargs (line 491) | def prepare_extra_step_kwargs(self, generator, eta):
    method check_inputs (line 509) | def check_inputs(
    method check_image (line 625) | def check_image(self, image, prompt, prompt_embeds):
    method prepare_image (line 657) | def prepare_image(
    method prepare_latents (line 710) | def prepare_latents(self, batch_size, num_channels_latents, height, wi...
    method _default_height_width (line 727) | def _default_height_width(self, height, width, image):
    method save_pretrained (line 753) | def save_pretrained(
    method previous_timestep (line 764) | def previous_timestep(self, timestep):
    method predict_start_from_noise (line 779) | def predict_start_from_noise(self, sample, t, model_output):
    method _sliding_windows (line 808) | def _sliding_windows(self,h: int, w: int, tile_size: int, tile_stride:...
    method _prepare_controlnet_inputs (line 824) | def _prepare_controlnet_inputs(self, latent_model_input, latents, prom...
    method _predict_noise (line 829) | def _predict_noise(self, latent_model_input, t, image, prompt_embeds, ...
    method _unet_predict (line 836) | def _unet_predict(self, latent_model_input, t, image, prompt_embeds, c...
    method _tile_predict (line 851) | def _tile_predict(self, latent_model_input, t, image, prompt_embeds, c...
    method _initial_step (line 881) | def _initial_step(self, do_classifier_free_guidance, latents, t, times...
    method _postprocess_latents (line 893) | def _postprocess_latents(self, latents, output_type, do_denormalize):
    method gaussian_weights (line 902) | def gaussian_weights(self, tile_width: int, tile_height: int, nbatches...
    method __call__ (line 921) | def __call__(

FILE: scripts/get_path.py
  function write_png_paths (line 3) | def write_png_paths(folder_path, txt_path):

FILE: test_ccsr_tile.py
  function load_pipeline (line 35) | def load_pipeline(args, accelerator, enable_xformers_memory_efficient_at...
  function main (line 103) | def main(args, enable_xformers_memory_efficient_attention=True,):

FILE: train_ccsr_stage1.py
  function image_grid (line 66) | def image_grid(imgs, rows, cols):
  function log_validation (line 77) | def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args,...
  function import_model_class_from_model_name_or_path (line 176) | def import_model_class_from_model_name_or_path(pretrained_model_name_or_...
  function save_model_card (line 196) | def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_...
  function parse_args (line 233) | def parse_args(input_args=None):
  function previous_timestep (line 561) | def previous_timestep(timestep):
  function predict_start_from_noise (line 576) | def predict_start_from_noise(sample, t, model_output):
  function save_model_hook (line 680) | def save_model_hook(models, weights, output_dir):
  function load_model_hook (line 688) | def load_model_hook(models, input_dir):

FILE: train_ccsr_stage2.py
  function image_grid (line 72) | def image_grid(imgs, rows, cols):
  function log_validation (line 83) | def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args,...
  function import_model_class_from_model_name_or_path (line 182) | def import_model_class_from_model_name_or_path(pretrained_model_name_or_...
  function save_model_card (line 202) | def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_...
  function parse_args (line 239) | def parse_args(input_args=None):
  function previous_timestep (line 583) | def previous_timestep(timestep):
  function predict_start_from_noise (line 598) | def predict_start_from_noise(sample, t, model_output):
  function save_model_hook (line 727) | def save_model_hook(models, weights, output_dir):
  function load_model_hook (line 737) | def load_model_hook(models, input_dir):

FILE: train_controlnet.py
  function image_grid (line 66) | def image_grid(imgs, rows, cols):
  function log_validation (line 77) | def log_validation(vae, text_encoder, tokenizer, unet, controlnet, args,...
  function import_model_class_from_model_name_or_path (line 176) | def import_model_class_from_model_name_or_path(pretrained_model_name_or_...
  function save_model_card (line 196) | def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_...
  function parse_args (line 233) | def parse_args(input_args=None):
  function previous_timestep (line 563) | def previous_timestep(timestep):
  function predict_start_from_noise (line 578) | def predict_start_from_noise(sample, t, model_output):
  function save_model_hook (line 681) | def save_model_hook(models, weights, output_dir):
  function load_model_hook (line 689) | def load_model_hook(models, input_dir):

FILE: utils/devices.py
  function has_mps (line 12) | def has_mps() -> bool:
  function get_cuda_device_string (line 19) | def get_cuda_device_string():
  function get_optimal_device_name (line 23) | def get_optimal_device_name():
  function get_optimal_device (line 33) | def get_optimal_device():
  function get_device_for (line 37) | def get_device_for(task):
  function torch_gc (line 41) | def torch_gc():
  function enable_tf32 (line 52) | def enable_tf32():
  function cond_cast_unet (line 75) | def cond_cast_unet(input):
  function cond_cast_float (line 79) | def cond_cast_float(input):
  function randn (line 83) | def randn(seed, shape):
  function randn_without_seed (line 88) | def randn_without_seed(shape):
  function autocast (line 92) | def autocast(disable=False):
  function without_autocast (line 99) | def without_autocast(disable=False):
  class NansException (line 103) | class NansException(Exception):
  function test_for_nans (line 107) | def test_for_nans(x, where):
  function first_time_calculation (line 126) | def first_time_calculation():

FILE: utils/img_util.py
  function save_videos_grid (line 12) | def save_videos_grid(videos, path=None, rescale=True, n_rows=4, fps=8, d...
  function convert_image_to_fn (line 32) | def convert_image_to_fn(img_type, minsize, image, eps=0.02):

FILE: utils/misc.py
  function rand_name (line 9) | def rand_name(length=8, suffix=''):
  function cycle (line 17) | def cycle(dl):
  function exists (line 22) | def exists(x):
  function identity (line 25) | def identity(x):
  function load_dreambooth_lora (line 28) | def load_dreambooth_lora(unet, vae=None, model_path=None, alpha=1.0, mod...

FILE: utils/vaehook.py
  function get_recommend_encoder_tile_size (line 82) | def get_recommend_encoder_tile_size():
  function get_recommend_decoder_tile_size (line 99) | def get_recommend_decoder_tile_size():
  function inplace_nonlinearity (line 129) | def inplace_nonlinearity(x):
  function attn_forward_new (line 136) | def attn_forward_new(self, h_):
  function attn_forward (line 172) | def attn_forward(self, h_):
  function xformer_attn_forward (line 198) | def xformer_attn_forward(self, h_):
  function attn2task (line 229) | def attn2task(task_queue, net):
  function resblock2task (line 247) | def resblock2task(queue, block):
  function build_sampling (line 278) | def build_sampling(task_queue, net, is_decoder):
  function build_task_queue (line 329) | def build_task_queue(net, is_decoder):
  function clone_task_queue (line 359) | def clone_task_queue(task_queue):
  function get_var_mean (line 368) | def get_var_mean(input, num_groups, eps=1e-6):
  function custom_group_norm (line 381) | def custom_group_norm(input, num_groups, mean, var, weight=None, bias=No...
  function crop_valid_region (line 413) | def crop_valid_region(x, input_bbox, target_bbox, is_decoder):
  function perfcount (line 429) | def perfcount(fn):
  class GroupNormParam (line 456) | class GroupNormParam:
    method __init__ (line 457) | def __init__(self):
    method add_tile (line 464) | def add_tile(self, tile, layer):
    method summary (line 486) | def summary(self):
    method from_tile (line 508) | def from_tile(tile, norm):
  class VAEHook (line 534) | class VAEHook:
    method __init__ (line 535) | def __init__(self, net, tile_size, is_decoder, fast_decoder, fast_enco...
    method __call__ (line 545) | def __call__(self, x):
    method get_best_tile_size (line 565) | def get_best_tile_size(self, lowerbound, upperbound):
    method split_tiles (line 580) | def split_tiles(self, h, w):
    method estimate_group_norm (line 640) | def estimate_group_norm(self, z, task_queue, color_fix):
    method vae_tile_forward (line 684) | def vae_tile_forward(self, z):

FILE: utils/wavelet_color_fix.py
  function adain_color_fix (line 14) | def adain_color_fix(target: Image, source: Image):
  function wavelet_color_fix (line 29) | def wavelet_color_fix(target: Image, source: Image):
  function calc_mean_std (line 44) | def calc_mean_std(feat: Tensor, eps=1e-5):
  function adaptive_instance_normalization (line 59) | def adaptive_instance_normalization(content_feat:Tensor, style_feat:Tens...
  function wavelet_blur (line 73) | def wavelet_blur(image: Tensor, radius: int):
  function wavelet_decomposition (line 94) | def wavelet_decomposition(image: Tensor, levels=5):
  function wavelet_reconstruction (line 108) | def wavelet_reconstruction(content_feat:Tensor, style_feat:Tensor):

Download .json

Condensed preview — 80 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (893K chars).

[
  {
    "path": ".idea/CCSR.iml",
    "chars": 441,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n  <component name=\"NewModuleRootManager"
  },
  {
    "path": ".idea/inspectionProfiles/Project_Default.xml",
    "chars": 510,
    "preview": "<component name=\"InspectionProjectProfileManager\">\n  <profile version=\"1.0\">\n    <option name=\"myName\" value=\"Project De"
  },
  {
    "path": ".idea/inspectionProfiles/profiles_settings.xml",
    "chars": 174,
    "preview": "<component name=\"InspectionProjectProfileManager\">\n  <settings>\n    <option name=\"USE_PROJECT_PROFILE\" value=\"false\" />\n"
  },
  {
    "path": ".idea/modules.xml",
    "chars": 260,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectModuleManager\">\n    <modules>\n   "
  },
  {
    "path": ".idea/vcs.xml",
    "chars": 180,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"VcsDirectoryMappings\">\n    <mapping dire"
  },
  {
    "path": ".idea/workspace.xml",
    "chars": 1567,
    "preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ChangeListManager\">\n    <list default=\"t"
  },
  {
    "path": "ADD/dnnlib/__init__.py",
    "chars": 488,
    "preview": "# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/dnnlib/util.py",
    "chars": 17209,
    "preview": "# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/layers/__init__.py",
    "chars": 411,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/attention.py",
    "chars": 2652,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/block.py",
    "chars": 9614,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/dino_head.py",
    "chars": 2007,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/drop_path.py",
    "chars": 1157,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/layer_scale.py",
    "chars": 820,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/mlp.py",
    "chars": 1269,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/patch_embed.py",
    "chars": 2836,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/layers/swiglu_ffn.py",
    "chars": 2182,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/models/discriminator.py",
    "chars": 6153,
    "preview": "# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/models/vit.py",
    "chars": 14210,
    "preview": "# Copyright (c) Meta Platforms, Inc. and affiliates.\n#\n# This source code is licensed under the Apache License, Version "
  },
  {
    "path": "ADD/th_utils/__init__.py",
    "chars": 448,
    "preview": "# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/custom_ops.py",
    "chars": 6646,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/misc.py",
    "chars": 11692,
    "preview": "# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/__init__.py",
    "chars": 448,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/bias_act.cpp",
    "chars": 4389,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/bias_act.cu",
    "chars": 6160,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/bias_act.h",
    "chars": 1293,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/bias_act.py",
    "chars": 9859,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/conv2d_gradfix.py",
    "chars": 9745,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/conv2d_resample.py",
    "chars": 6765,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu.cpp",
    "chars": 15597,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu.cu",
    "chars": 67307,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu.h",
    "chars": 4448,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu.py",
    "chars": 12916,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu_ns.cu",
    "chars": 1638,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu_rd.cu",
    "chars": 1607,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/filtered_lrelu_wr.cu",
    "chars": 1608,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/fma.py",
    "chars": 2047,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/grid_sample_gradfix.py",
    "chars": 3399,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/th_utils/ops/upfirdn2d.cpp",
    "chars": 5027,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/upfirdn2d.cu",
    "chars": 23137,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/upfirdn2d.h",
    "chars": 1849,
    "preview": "// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n//\n// NVIDIA CORPORATION and its licensors"
  },
  {
    "path": "ADD/th_utils/ops/upfirdn2d.py",
    "chars": 16424,
    "preview": "# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "ADD/utils/util_net.py",
    "chars": 5867,
    "preview": "#!/usr/bin/env python\n# -*- coding:utf-8 -*-\n# Power by Zongsheng Yue 2021-11-24 20:29:36\n\nimport math\nimport torch\nfrom"
  },
  {
    "path": "LICENSE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 13941,
    "preview": "<p align=\"center\">\n    <img src=\"figs/logo.png\" width=\"400\">\n</p>\n\n<div align=\"center\">\n<h2>Improving the Stability and "
  },
  {
    "path": "dataloaders/paired_dataset_txt.py",
    "chars": 2026,
    "preview": "import glob\nimport os\nfrom PIL import Image\nimport random\nimport numpy as np\n\nfrom torch import nn\nfrom torchvision impo"
  },
  {
    "path": "dataloaders/params_ccsr.yml",
    "chars": 1149,
    "preview": "scale: 4\ncolor_jitter_prob: 0.0\ngray_prob: 0.0\n\n# the first degradation process\nresize_prob: [0.2, 0.7, 0.1]  # up, down"
  },
  {
    "path": "dataloaders/realesrgan.py",
    "chars": 12969,
    "preview": "import os\nimport numpy as np\nimport cv2\nimport glob\nimport math\nimport yaml\nimport random\nfrom collections import Ordere"
  },
  {
    "path": "models/DiffAugment.py",
    "chars": 5490,
    "preview": "# BSD 2-Clause \"Simplified\" License\n# Copyright (c) 2020, Shengyu Zhao, Zhijian Liu, Ji Lin, Jun-Yan Zhu, and Song Han\n#"
  },
  {
    "path": "models/controlnet.py",
    "chars": 41842,
    "preview": "# Copyright 2023 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "models/losses/__init__.py",
    "chars": 63,
    "preview": "from models.losses.contperceptual import LPIPSWithDiscriminator"
  },
  {
    "path": "models/losses/contperceptual.py",
    "chars": 8323,
    "preview": "import torch\nimport torch.nn as nn\n\nfrom taming.modules.losses.vqperceptual import *  # TODO: taming dependency yes/no?\n"
  },
  {
    "path": "models/losses/vqperceptual.py",
    "chars": 8673,
    "preview": "import torch\nfrom torch import nn\nimport torch.nn.functional as F\nfrom einops import repeat\n\nfrom taming.modules.discrim"
  },
  {
    "path": "models/shared.py",
    "chars": 4045,
    "preview": "# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.\n#\n# NVIDIA CORPORATION and its licensors re"
  },
  {
    "path": "models/unet_2d_blocks.py",
    "chars": 122480,
    "preview": "# Copyright 2023 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "models/unet_2d_condition.py",
    "chars": 55424,
    "preview": "# Copyright 2023 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "models/vit_utils.py",
    "chars": 6194,
    "preview": "# MIT License\n#\n# Copyright (c) 2021 Intel ISL (Intel Intelligent Systems Lab)\n#\n# Permission is hereby granted, free of"
  },
  {
    "path": "myutils/devices.py",
    "chars": 3315,
    "preview": "import sys\nimport contextlib\nfrom functools import lru_cache\n\nimport torch\n#from modules import errors\n\nif sys.platform "
  },
  {
    "path": "myutils/img_util.py",
    "chars": 1177,
    "preview": "import os\nimport PIL\nimport cv2\nimport math\nimport numpy as np\nimport torch\nimport torchvision\nimport imageio\n\nfrom eino"
  },
  {
    "path": "myutils/misc.py",
    "chars": 1993,
    "preview": "import os\nimport binascii\nfrom safetensors import safe_open\n\nimport torch\n\nfrom diffusers.pipelines.stable_diffusion.con"
  },
  {
    "path": "myutils/vaehook.py",
    "chars": 31803,
    "preview": "# ------------------------------------------------------------------------\n#\n#   Ultimate VAE Tile Optimization\n#\n#   In"
  },
  {
    "path": "myutils/wavelet_color_fix.py",
    "chars": 4482,
    "preview": "'''\n# --------------------------------------------------------------------------------\n#   Color fixed script from Li Yi"
  },
  {
    "path": "pipelines/pipeline_ccsr.py",
    "chars": 51664,
    "preview": "# Copyright 2023 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "requirements.txt",
    "chars": 230,
    "preview": "diffusers==0.21.0\ntorch==2.0.1\npytorch_lightning\naccelerate==1.2.0\ntransformers==4.25.0\nxformers==0.0.22 \nloralib\nfairsc"
  },
  {
    "path": "scripts/get_path.py",
    "chars": 378,
    "preview": "import os\n\ndef write_png_paths(folder_path, txt_path):\n    with open(txt_path, 'w') as f:\n        for root, dirs, files "
  },
  {
    "path": "scripts/test/test_ccsr_multistep.sh",
    "chars": 482,
    "preview": "python test_ccsr_tile.py \\\n--pretrained_model_path preset/models/stable-diffusion-2-1-base \\\n--controlnet_model_path pre"
  },
  {
    "path": "scripts/test/test_ccsr_onestep.sh",
    "chars": 466,
    "preview": "\npython test_ccsr_tile.py \\\n--pretrained_model_path preset/models/stable-diffusion-2-1-base \\\n--controlnet_model_path pr"
  },
  {
    "path": "scripts/test/test_ccsr_tile.sh",
    "chars": 635,
    "preview": "python test_ccsr_tile.py \\\n--pretrained_model_path preset/models/stable-diffusion-2-1-base \\\n--controlnet_model_path pre"
  },
  {
    "path": "scripts/train/train_ccsr_stage1.sh",
    "chars": 583,
    "preview": "CUDA_VISIBLE_DEVICES=\"0,1,2,3,\" accelerate launch train_ccsr_stage1.py \\\n--pretrained_model_name_or_path=\"preset/models/"
  },
  {
    "path": "scripts/train/train_ccsr_stage2.sh",
    "chars": 711,
    "preview": "CUDA_VISIBLE_DEVICES=\"0,1,2,3,\" accelerate launch train_ccsr_stage2.py \\\n--pretrained_model_name_or_path=\"preset/models/"
  },
  {
    "path": "scripts/train/train_controlnet.sh",
    "chars": 550,
    "preview": "\nCUDA_VISIBLE_DEVICES=\"0,1,2,3,\" accelerate launch train_controlnet.py \\\n--pretrained_model_name_or_path=\"preset/models/"
  },
  {
    "path": "test_ccsr_tile.py",
    "chars": 12140,
    "preview": "import os\nimport glob\nimport math\nimport time\nimport argparse\n\nimport numpy as np\nfrom PIL import Image\nimport safetenso"
  },
  {
    "path": "train_ccsr_stage1.py",
    "chars": 43150,
    "preview": "#!/usr/bin/env python\n# coding=utf-8\n# Copyright 2023 The HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under"
  },
  {
    "path": "train_ccsr_stage2.py",
    "chars": 49302,
    "preview": "#!/usr/bin/env python\n# coding=utf-8\n# Copyright 2023 The HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under"
  },
  {
    "path": "train_controlnet.py",
    "chars": 38920,
    "preview": "#!/usr/bin/env python\n# coding=utf-8\n# Copyright 2023 The HuggingFace Inc. team. All rights reserved.\n#\n# Licensed under"
  },
  {
    "path": "utils/devices.py",
    "chars": 3315,
    "preview": "import sys\nimport contextlib\nfrom functools import lru_cache\n\nimport torch\n#from modules import errors\n\nif sys.platform "
  },
  {
    "path": "utils/img_util.py",
    "chars": 1177,
    "preview": "import os\nimport PIL\nimport cv2\nimport math\nimport numpy as np\nimport torch\nimport torchvision\nimport imageio\n\nfrom eino"
  },
  {
    "path": "utils/misc.py",
    "chars": 1993,
    "preview": "import os\nimport binascii\nfrom safetensors import safe_open\n\nimport torch\n\nfrom diffusers.pipelines.stable_diffusion.con"
  },
  {
    "path": "utils/vaehook.py",
    "chars": 31687,
    "preview": "# ------------------------------------------------------------------------\n#\n#   Ultimate VAE Tile Optimization\n#\n#   In"
  },
  {
    "path": "utils/wavelet_color_fix.py",
    "chars": 4482,
    "preview": "'''\n# --------------------------------------------------------------------------------\n#   Color fixed script from Li Yi"
  }
]

About this extraction

This page contains the full source code of the csslc/CCSR GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 80 files (842.8 KB), approximately 207.0k tokens, and a symbol index with 535 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo