Repository: jankrepl/mildlyoverfitted Branch: master Commit: 22f0ecc67cef Files: 118 Total size: 314.6 KB Directory structure: gitextract_ixgqmhua/ ├── .gitignore ├── LICENSE ├── README.md ├── github_adventures/ │ ├── automata/ │ │ ├── model.py │ │ └── train.py │ ├── diffaugment/ │ │ ├── README.MD │ │ ├── script.py │ │ └── utils.py │ ├── dino/ │ │ ├── data/ │ │ │ ├── README.md │ │ │ └── imagenette_labels.json │ │ ├── evaluation.py │ │ ├── train.py │ │ ├── utils.py │ │ ├── visualize_attentions.ipynb │ │ └── visualize_augmentations.ipynb │ ├── gpt/ │ │ ├── README.md │ │ ├── copy_and_generate.py │ │ ├── distribution_visualizations.ipynb │ │ ├── ipython_code.py │ │ ├── model.py │ │ ├── requirements.txt │ │ └── utils.py │ ├── integer/ │ │ ├── README.md │ │ ├── bert.py │ │ ├── experiments.sh │ │ ├── fetch_data.py │ │ ├── glove.py │ │ ├── lstm.py │ │ ├── requirements.txt │ │ └── utils.py │ ├── lottery/ │ │ ├── README.md │ │ ├── data.py │ │ ├── main.py │ │ ├── parallel_launch.sh │ │ ├── requirements.txt │ │ └── utils.py │ ├── mixer/ │ │ ├── README.md │ │ ├── official.py │ │ ├── ours.py │ │ └── test_compare.py │ ├── mixup/ │ │ ├── launch_experiments.sh │ │ ├── train.py │ │ └── utils.py │ ├── ner_evaluation/ │ │ ├── README.md │ │ ├── ours.py │ │ ├── test_ours.py │ │ └── try.py │ ├── neuron/ │ │ ├── README.md │ │ ├── evaluate_noise.py │ │ ├── evaluate_shuffling.py │ │ ├── evaluate_video.py │ │ ├── launch.sh │ │ ├── pretrained/ │ │ │ ├── MLP.pkl │ │ │ ├── MLP_augment.pkl │ │ │ ├── invariant_official.pkl │ │ │ ├── invariant_ours.pkl │ │ │ ├── linear.pkl │ │ │ └── linear_augment.pkl │ │ ├── requirements.txt │ │ ├── solutions.py │ │ ├── tasks.py │ │ ├── torch_utils.py │ │ └── trainer.py │ ├── pondernet/ │ │ ├── experiment_1.sh │ │ ├── experiment_2.sh │ │ ├── requirements.txt │ │ ├── train.py │ │ └── utils.py │ ├── product_quantization/ │ │ ├── README.md │ │ ├── convert.py │ │ ├── custom.py │ │ ├── faiss_101_ipython.py │ │ ├── generate_index.py │ │ ├── parse.py │ │ ├── 
requirements.txt │ │ ├── run_all.sh │ │ └── run_gradio.py │ ├── siren/ │ │ ├── activations.py │ │ ├── core.py │ │ └── train.py │ └── vision_transformer/ │ ├── classes.txt │ ├── custom.py │ ├── forward.py │ └── verify.py └── mini_tutorials/ ├── bentoml/ │ ├── README.md │ ├── bentofile.yaml │ ├── create_model.py │ ├── requirements.txt │ └── service.py ├── custom_optimizer_in_pytorch/ │ ├── custom.py │ └── src.py ├── deploying_on_kubernetes/ │ ├── Dockerfile │ ├── DockerfileConda │ └── README.md ├── embedding/ │ ├── README.md │ ├── Visualize.ipynb │ └── src.py ├── fewshot_text_classification/ │ ├── classify.py │ └── template.jinja2 ├── gradient_wrt_input/ │ ├── explain.py │ ├── fool.py │ └── utils.py ├── haiku_basics/ │ ├── buffers_in_torch.py │ ├── parameter.py │ ├── reallife.py │ ├── requirements.txt │ └── state.py ├── httpx_rate_limiting/ │ └── script.py ├── mocking_neural_networks/ │ ├── app.py │ └── test.py ├── numpy_equality_testing/ │ └── test.py ├── openai_function_calling/ │ └── example.py ├── rag_with_reranking/ │ ├── README.md │ ├── answer.py │ ├── input.txt │ ├── postman_collection.json │ └── upload_data.py └── visualizing_activations_with_forward_hooks/ └── src.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Jan Krepl Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of 
the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # mildlyoverfitted Code for https://www.youtube.com/c/mildlyoverfitted. ### Overview | Name | Video | Code | |--------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------------------------------------------------------------------------| | Asynchronous requests and rate limiting | [link](https://youtu.be/luWsr9exlE4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/httpx_rate_limiting) | | BentoML Sagemaker deployment | [link](https://youtu.be/Zci_D4az9FU) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/bentoml) | | Custom optimizer in PyTorch | [link](https://youtu.be/zvp8K4iX2Cs) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/custom_optimizer_in_pytorch) | | Deploying machine learning models on Kubernetes | [link](https://youtu.be/DQRNt8Diyw4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/deploying_on_kubernetes) | | Differentiable augmentation for GANs (using Kornia) | [link](https://youtu.be/J97EM3Clyys) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/diffaugment) | | DINO in PyTorch | [link](https://youtu.be/psmMEWKk4Uk) | 
[link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/dino) | | Few-shot text classification with prompts | [link](https://youtu.be/AhqgDXcBU2M) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/fewshot_text_classification) | | GPT in PyTorch | [link](https://youtu.be/d7IRM40VMYM) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/gpt) | | Gradient with respect to input in PyTorch (FGSM attack + Integrated Gradients) | [link](https://youtu.be/5lFiZTSsp40) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/gradient_wrt_input) | | Growing neural cellular automata in PyTorch | [link](https://youtu.be/21ACbWoF2Oo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/automata) | | Haiku basics | [link](https://youtu.be/yXCKS-ZoYTY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/haiku_basics) | | Integer embeddings in PyTorch | [link](https://youtu.be/bybuSBVzOdg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/integer) | | Mixup in PyTorch | [link](https://youtu.be/hGAKHKqmXdY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/mixup) | | MLP-Mixer in Flax and PyTorch | [link](https://youtu.be/HqytB2GUbHA) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/mixer) | | Mocking neural networks: unit testing in deep learning | [link](https://youtu.be/_KVV9jXSzvo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/mocking_neural_networks) | | NER model evaluation | [link](https://youtu.be/70YAUYP3hrw) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/ner_evaluation) | | NumPy equality testing | [link](https://youtu.be/sai1g5fjyb8) | 
[link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/numpy_equality_testing) | | OpenAI function calling | [link](https://youtu.be/_B7F_6nTVEg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/openai_function_calling) | | PonderNet in PyTorch | [link](https://youtu.be/JLFz1dU5HR4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/pondernet) | | Product quantization in Faiss and from scratch | [link](https://youtu.be/PNVJvZEkuXo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/product_quantization) | | Retrieval augmented generation with OpenSearch and reranking | [link](https://youtu.be/OsE7YcDcPz0) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/rag_with_reranking) | | SIREN in PyTorch | [link](https://youtu.be/s4iFEoNlYhM) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/siren) | | The Lottery Ticket Hypothesis and pruning in PyTorch | [link](https://youtu.be/bQt0CLXXAqg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/lottery) | | The Sensory Neuron as a Transformer in PyTorch | [link](https://youtu.be/mi_mzlhBGAU) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/neuron) | | `torch.nn.Embedding` explained (+ Character-level language model) | [link](https://youtu.be/euwN5DHfLEo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/embedding) | | Vision Transformer in PyTorch | [link](https://youtu.be/ovB0ddFtzzA) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/vision_transformer) | | Visualizing activations with forward hooks (PyTorch) | [link](https://youtu.be/1ZbLA7ofasY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/visualizing_activations_with_forward_hooks) | 
# ============ github_adventures/automata/model.py ============
import torch
import torch.nn as nn


class CAModel(nn.Module):
    """Cellular automata model.

    Parameters
    ----------
    n_channels : int
        Number of channels of the grid.

    hidden_channels : int
        Hidden channels that are related to the pixelwise 1x1 convolution.

    fire_rate : float
        Number between 0 and 1. The lower it is the more likely it is for
        cells to be set to zero during the `stochastic_update` process.

    device : torch.device
        Determines on what device we perform all the computations.

    Attributes
    ----------
    update_module : nn.Sequential
        The only part of the network containing trainable parameters. Composed
        of 1x1 convolution, ReLU and 1x1 convolution.

    filters : torch.Tensor
        Constant tensor of shape `(3 * n_channels, 1, 3, 3)`.
    """

    def __init__(self, n_channels=16, hidden_channels=128, fire_rate=0.5, device=None):
        super().__init__()
        # BUG FIX: the original hardcoded `self.fire_rate = 0.5`, silently
        # ignoring the `fire_rate` constructor argument.
        self.fire_rate = fire_rate
        self.n_channels = n_channels
        self.device = device or torch.device("cpu")

        # Perceive step: fixed (non-trainable) identity + Sobel filters that
        # approximate channelwise spatial gradients.
        sobel_filter_ = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
        scalar = 8.0

        sobel_filter_x = sobel_filter_ / scalar
        sobel_filter_y = sobel_filter_.t() / scalar
        identity_filter = torch.tensor(
            [
                [0, 0, 0],
                [0, 1, 0],
                [0, 0, 0],
            ],
            dtype=torch.float32,
        )
        filters = torch.stack(
            [identity_filter, sobel_filter_x, sobel_filter_y]
        )  # (3, 3, 3)
        filters = filters.repeat((n_channels, 1, 1))  # (3 * n_channels, 3, 3)
        # NOTE(review): `filters` is a plain attribute, not a registered
        # buffer, so `model.to(device)` after construction will not move it;
        # kept as-is to preserve state_dict keys.
        self.filters = filters[:, None, ...].to(
            self.device
        )  # (3 * n_channels, 1, 3, 3)

        # Update step
        self.update_module = nn.Sequential(
            nn.Conv2d(
                3 * n_channels,
                hidden_channels,
                kernel_size=1,  # (1, 1)
            ),
            nn.ReLU(),
            nn.Conv2d(
                hidden_channels,
                n_channels,
                kernel_size=1,
                bias=False,
            ),
        )

        # Zero-initializing the last convolution makes the initial update a
        # no-op ("do nothing" starting behavior).
        with torch.no_grad():
            self.update_module[2].weight.zero_()

        self.to(self.device)

    def perceive(self, x):
        """Approximate channelwise gradient and combine with the input.

        This is the only place where we include information on the
        neighboring cells. However, we are not using any learnable
        parameters here.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, 3 * n_channels, grid_size, grid_size)`.
        """
        return nn.functional.conv2d(x, self.filters, padding=1, groups=self.n_channels)

    def update(self, x):
        """Perform update.

        Note that this is the only part of the forward pass that uses
        trainable parameters.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, 3 * n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.
        """
        return self.update_module(x)

    @staticmethod
    def stochastic_update(x, fire_rate):
        """Run pixel-wise dropout.

        Unlike dropout there is no scaling taking place.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        fire_rate : float
            Number between 0 and 1. The higher the more likely a given cell
            updates.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.
        """
        device = x.device

        mask = (torch.rand(x[:, :1, :, :].shape) <= fire_rate).to(device, torch.float32)
        return x * mask  # broadcasted over all channels

    @staticmethod
    def get_living_mask(x):
        """Identify living cells.

        A cell is alive if its own alpha channel (channel 3) or any of its
        neighbors' exceeds 0.1 (hence the 3x3 max-pool).

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, 1, grid_size, grid_size)` and the dtype is
            bool.
        """
        return (
            nn.functional.max_pool2d(
                x[:, 3:4, :, :], kernel_size=3, stride=1, padding=1
            )
            > 0.1
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.
        """
        pre_life_mask = self.get_living_mask(x)

        y = self.perceive(x)
        dx = self.update(y)
        dx = self.stochastic_update(dx, fire_rate=self.fire_rate)

        x = x + dx

        post_life_mask = self.get_living_mask(x)
        # Only cells alive both before and after the update survive.
        life_mask = (pre_life_mask & post_life_mask).to(torch.float32)

        return x * life_mask
# ============ github_adventures/automata/train.py ============
import argparse
import pathlib

import numpy as np
import torch
import torch.nn as nn

# NOTE: PIL, tensorboard, tqdm and the local `model` module are imported
# lazily inside the functions that need them, so that the pure tensor
# utilities (`to_rgb`, `make_seed`) work without those dependencies.


def load_image(path, size=40):
    """Load an image.

    Parameters
    ----------
    path : pathlib.Path
        Path to where the image is located. Note that the image needs to be
        RGBA.

    size : int
        The image will be resized to a square with a side length of `size`.

    Returns
    -------
    torch.Tensor
        4D float image of shape `(1, 4, size, size)`. The RGB channels are
        premultiplied by the alpha channel.
    """
    # Deferred import: only this function needs Pillow.
    from PIL import Image

    img = Image.open(path)
    # FIX: `Image.ANTIALIAS` was removed in Pillow 10; `Image.LANCZOS` is
    # the identical resampling filter under its current name.
    img = img.resize((size, size), Image.LANCZOS)
    img = np.float32(img) / 255.0
    img[..., :3] *= img[..., 3:]

    return torch.from_numpy(img).permute(2, 0, 1)[None, ...]


def to_rgb(img_rgba):
    """Convert RGBA image to RGB image.

    Parameters
    ----------
    img_rgba : torch.Tensor
        4D tensor of shape `(1, 4, size, size)` where the RGB channels were
        already multiplied by the alpha.

    Returns
    -------
    img_rgb : torch.Tensor
        4D tensor of shape `(1, 3, size, size)`.
    """
    rgb, a = img_rgba[:, :3, ...], torch.clamp(img_rgba[:, 3:, ...], 0, 1)
    return torch.clamp(1.0 - a + rgb, 0, 1)


def make_seed(size, n_channels):
    """Create a starting tensor for training.

    The only active pixels are going to be in the middle.

    Parameters
    ----------
    size : int
        The height and the width of the tensor.

    n_channels : int
        Overall number of channels. Note that it needs to be higher than 4
        since the first 4 channels represent RGBA.

    Returns
    -------
    torch.Tensor
        4D float tensor of shape `(1, n_channels, size, size)`.
    """
    x = torch.zeros((1, n_channels, size, size), dtype=torch.float32)
    # Activate every non-RGB channel of the central pixel.
    x[:, 3:, size // 2, size // 2] = 1
    return x


def main(argv=None):
    """Run the cellular-automata training loop (CLI entry point)."""
    # Deferred imports: only the training entry point needs these.
    from torch.utils.tensorboard import SummaryWriter
    from tqdm import tqdm

    from model import CAModel

    parser = argparse.ArgumentParser(
        description="Training script for the Cellular Automata"
    )
    parser.add_argument("img", type=str, help="Path to the image we want to reproduce")
    parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=8,
        help="Batch size. Samples will always be taken randomly from the pool."
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        default="cpu",
        help="Device to use",
        choices=("cpu", "cuda"),
    )
    parser.add_argument(
        "-e",
        "--eval-frequency",
        type=int,
        default=500,
        help="Evaluation frequency.",
    )
    parser.add_argument(
        "-i",
        "--eval-iterations",
        type=int,
        default=300,
        help="Number of iterations when evaluating.",
    )
    parser.add_argument(
        "-n",
        "--n-batches",
        type=int,
        default=5000,
        help="Number of batches to train for.",
    )
    parser.add_argument(
        "-c",
        "--n-channels",
        type=int,
        default=16,
        help="Number of channels of the input tensor",
    )
    parser.add_argument(
        "-l",
        "--logdir",
        type=str,
        default="logs",
        help="Folder where all the logs and outputs are saved.",
    )
    parser.add_argument(
        "-p",
        "--padding",
        type=int,
        default=16,
        help="Padding. The shape after padding is (h + 2 * p, w + 2 * p).",
    )
    parser.add_argument(
        "--pool-size",
        type=int,
        default=1024,
        help="Size of the training pool",
    )
    parser.add_argument(
        "-s",
        "--size",
        type=int,
        default=40,
        help="Image size",
    )

    # Parse arguments
    # FIX: honor the `argv` parameter (the original called `parse_args()`
    # with no arguments, silently ignoring `argv`).
    args = parser.parse_args(argv)
    print(vars(args))

    # Misc
    device = torch.device(args.device)

    log_path = pathlib.Path(args.logdir)
    log_path.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(log_path)

    # Target image
    target_img_ = load_image(args.img, size=args.size)
    p = args.padding
    target_img_ = nn.functional.pad(target_img_, (p, p, p, p), "constant", 0)
    target_img = target_img_.to(device)
    target_img = target_img.repeat(args.batch_size, 1, 1, 1)

    writer.add_image("ground truth", to_rgb(target_img_)[0])

    # Model and optimizer
    model = CAModel(n_channels=args.n_channels, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

    # Pool initialization
    seed = make_seed(args.size, args.n_channels).to(device)
    seed = nn.functional.pad(seed, (p, p, p, p), "constant", 0)
    pool = seed.clone().repeat(args.pool_size, 1, 1, 1)

    for it in tqdm(range(args.n_batches)):
        batch_ixs = np.random.choice(
            args.pool_size, args.batch_size, replace=False
        ).tolist()

        x = pool[batch_ixs]
        # Evolve for a random number of steps so the model learns to be
        # stable over varying horizons.
        for i in range(np.random.randint(64, 96)):
            x = model(x)

        loss_batch = ((target_img - x[:, :4, ...]) ** 2).mean(dim=[1, 2, 3])
        loss = loss_batch.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        writer.add_scalar("train/loss", loss, it)

        # Pool update: replace the worst sample with a fresh seed, put the
        # evolved states back for the rest.
        argmax_batch = loss_batch.argmax().item()
        argmax_pool = batch_ixs[argmax_batch]
        remaining_batch = [i for i in range(args.batch_size) if i != argmax_batch]
        remaining_pool = [i for i in batch_ixs if i != argmax_pool]

        pool[argmax_pool] = seed.clone()
        pool[remaining_pool] = x[remaining_batch].detach()

        if it % args.eval_frequency == 0:
            # FIX: no gradients are needed during the evaluation rollout;
            # without `no_grad` every eval step grows an autograd graph.
            with torch.no_grad():
                x_eval = seed.clone()  # (1, n_channels, size, size)

                eval_video = torch.empty(
                    1, args.eval_iterations, 3, *x_eval.shape[2:]
                )

                for it_eval in range(args.eval_iterations):
                    x_eval = model(x_eval)
                    x_eval_out = to_rgb(x_eval[:, :4].detach().cpu())
                    eval_video[0, it_eval] = x_eval_out

                writer.add_video("eval", eval_video, it, fps=60)


if __name__ == "__main__":
    main()
def main(argv=None):
    """Train a DCGAN, optionally with Kornia differentiable augmentations.

    Parameters
    ----------
    argv : None or list
        Command line arguments; `None` falls back to `sys.argv`.
    """
    # CLI
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="Name of the experiment")
    parser.add_argument(
        "-a",
        "--augment",
        action="store_true",
        help="If True, we apply augmentations",
    )
    parser.add_argument(
        "-b", "--batch-size", type=int, default=16, help="Batch size"
    )
    parser.add_argument(
        "--b1",
        type=float,
        default=0.5,
        help="Adam optimizer hyperparameter",  # typo fix: "hyperparamter"
    )
    parser.add_argument(
        "--b2",
        type=float,
        default=0.999,
        help="Adam optimizer hyperparameter",  # typo fix: "hyperparamter"
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        default="cpu",
        choices=["cpu", "cuda"],
        help="Device to use",
    )
    parser.add_argument(
        "--eval-frequency",
        type=int,
        default=400,
        help="Generate generator images every `eval_frequency` epochs",
    )
    parser.add_argument(
        "--latent-dim",
        type=int,
        default=100,
        help="Dimensionality of the random noise",
    )
    parser.add_argument(
        "--lr", type=float, default=0.0002, help="Learning rate"
    )
    parser.add_argument(
        "--ndf",
        type=int,
        default=32,
        help="Number of discriminator feature maps (after first convolution)",
    )
    parser.add_argument(
        "--ngf",
        type=int,
        default=32,
        help="Number of generator feature maps (before last transposed convolution)",
    )
    parser.add_argument(
        "-n",
        "--n-epochs",
        type=int,
        default=200,
        help="Number of training epochs",
    )
    parser.add_argument(
        "--mosaic-size",
        type=int,
        default=10,
        help="Size of the side of the rectangular mosaic",
    )
    parser.add_argument(
        "-p",
        "--prob",
        type=float,
        default=0.9,
        help="Probability of applying an augmentation",
    )
    args = parser.parse_args(argv)
    args_d = vars(args)
    print(args)

    img_size = 128

    # Additional parameters
    device = torch.device(args.device)
    mosaic_kwargs = {"nrow": args.mosaic_size, "normalize": True}
    n_mosaic_cells = args.mosaic_size * args.mosaic_size
    sample_showcase_ix = (
        0  # this one will be used to demonstrate the augmentations
    )

    augment_module = torch.nn.Sequential(
        K.RandomAffine(degrees=0, translate=(1 / 8, 1 / 8), p=args.prob),
        K.RandomErasing((0.0, 0.5), p=args.prob),
    )

    # Loss function
    adversarial_loss = torch.nn.BCELoss()

    # Initialize generator and discriminator
    generator = Generator(latent_dim=args.latent_dim, ngf=args.ngf)
    discriminator = Discriminator(
        ndf=args.ndf, augment_module=augment_module if args.augment else None
    )
    generator.to(device)
    discriminator.to(device)

    # Initialize weights
    generator.apply(init_weights_)
    discriminator.apply(init_weights_)

    # Configure data loader
    data_path = pathlib.Path("data")
    tform = transforms.Compose(
        [
            transforms.Resize(img_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )
    dataset = DatasetImages(
        data_path,
        transform=tform,
    )
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
    )

    # Optimizers
    optimizer_G = torch.optim.Adam(
        generator.parameters(), lr=args.lr, betas=(args.b1, args.b2)
    )
    optimizer_D = torch.optim.Adam(
        discriminator.parameters(), lr=args.lr, betas=(args.b1, args.b2)
    )

    # Output path and metadata
    output_path = pathlib.Path("outputs") / args.name
    output_path.mkdir(exist_ok=True, parents=True)

    # Add other parameters (not included in CLI)
    args_d["time"] = datetime.now()
    args_d["kornia"] = str(augment_module)

    # Prepare tensorboard writer
    writer = SummaryWriter(output_path)

    # Log hyperparameters as text
    writer.add_text(
        "hyperparameter",
        pprint.pformat(args_d).replace(
            "\n", "  \n"
        ),  # markdown needs 2 spaces before newline
        0,
    )

    # Log true data
    writer.add_image(
        "true_data",
        make_grid(
            torch.stack([dataset[i] for i in range(n_mosaic_cells)]),
            **mosaic_kwargs
        ),
        0,
    )

    # Log augmented data
    batch_showcase = dataset[sample_showcase_ix][None, ...].repeat(
        n_mosaic_cells, 1, 1, 1
    )
    batch_showcase_aug = discriminator.augment_module(batch_showcase)
    writer.add_image(
        "augmentations", make_grid(batch_showcase_aug, **mosaic_kwargs), 0
    )

    # Prepare evaluation noise (fixed, so mosaics are comparable over time)
    z_eval = torch.randn(n_mosaic_cells, args.latent_dim).to(device)

    for epoch in tqdm(range(args.n_epochs)):
        for i, imgs in enumerate(dataloader):
            n_samples, *_ = imgs.shape
            batches_done = epoch * len(dataloader) + i

            # Adversarial ground truths (0.9 = one-sided label smoothing)
            valid = 0.9 * torch.ones(
                n_samples, 1, device=device, dtype=torch.float32
            )
            fake = torch.zeros(n_samples, 1, device=device, dtype=torch.float32)

            # D preparation
            optimizer_D.zero_grad()

            # D loss on reals
            real_imgs = imgs.to(device)
            d_x = discriminator(real_imgs)
            real_loss = adversarial_loss(d_x, valid)
            real_loss.backward()

            # D loss on fakes
            z = torch.randn(n_samples, args.latent_dim).to(device)
            gen_imgs = generator(z)
            d_g_z1 = discriminator(gen_imgs.detach())
            fake_loss = adversarial_loss(d_g_z1, fake)
            fake_loss.backward()

            optimizer_D.step()  # we called backward twice, the result is a sum

            # G preparation
            optimizer_G.zero_grad()

            # G loss
            d_g_z2 = discriminator(gen_imgs)
            g_loss = adversarial_loss(d_g_z2, valid)

            g_loss.backward()
            optimizer_G.step()

            # Logging
            if batches_done % 50 == 0:
                writer.add_scalar("d_x", d_x.mean().item(), batches_done)
                writer.add_scalar("d_g_z1", d_g_z1.mean().item(), batches_done)
                writer.add_scalar("d_g_z2", d_g_z2.mean().item(), batches_done)
                writer.add_scalar(
                    "D_loss", (real_loss + fake_loss).item(), batches_done
                )
                writer.add_scalar("G_loss", g_loss.item(), batches_done)

            if epoch % args.eval_frequency == 0 and i == 0:
                generator.eval()
                discriminator.eval()

                # Generate fake images
                # FIX: no gradients are needed for evaluation images.
                with torch.no_grad():
                    gen_imgs_eval = generator(z_eval)

                # Generate nice mosaic
                writer.add_image(
                    "fake",
                    make_grid(gen_imgs_eval.data, **mosaic_kwargs),
                    batches_done,
                )

                # Save checkpoint (and potentially overwrite an existing one)
                torch.save(generator, output_path / "model.pt")

                # Make sure generator and discriminator in the training mode
                generator.train()
                discriminator.train()


if __name__ == "__main__":
    main()
class DatasetImages(Dataset):
    """Dataset loading photos on the hard drive.

    Parameters
    ----------
    path : pathlib.Path
        Path to the folder containing all the images.

    transform : None or callable
        The transform to be applied when yielding the image.

    Attributes
    ----------
    all_paths : list
        List of all paths to the `.jpg` images.
    """

    def __init__(self, path, transform=None):
        super().__init__()
        self.all_paths = sorted([p for p in path.iterdir() if p.suffix == ".jpg"])
        self.transform = transform

    def __len__(self):
        """Compute length of the dataset."""
        return len(self.all_paths)

    def __getitem__(self, ix):
        """Get a single item."""
        # Deferred import: Pillow is only needed when images are actually
        # read, so the torch-only models below stay usable without it.
        from PIL import Image

        img = Image.open(self.all_paths[ix])

        if self.transform is not None:
            img = self.transform(img)

        return img


class Generator(nn.Module):
    """Generator network.

    Parameters
    ----------
    latent_dim : int
        The dimensionality of the input noise.

    ngf : int
        Number of generator filters. Note that the actual number of filters
        will be a multiple of this number and is going to be divided by two
        in each consecutive block of the network.

    Attributes
    ----------
    main : torch.Sequential
        The actual network that is composed of `ConvTranspose2d`,
        `BatchNorm2d` and `ReLU` blocks.
    """

    def __init__(self, latent_dim, ngf=64):
        super().__init__()
        self.main = nn.Sequential(
            nn.ConvTranspose2d(latent_dim, ngf * 16, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 16),
            nn.ReLU(True),
            # (ngf * 16) x 4 x 4
            nn.ConvTranspose2d(ngf * 16, ngf * 8, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # (ngf * 8) x 8 x 8
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # (ngf * 4) x 16 x 16
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # (ngf * 2) x 32 x 32
            nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # ngf x 64 x 64
            nn.ConvTranspose2d(ngf, 3, 4, 2, 1, bias=False),
            nn.Tanh(),
            # 3 x 128 x 128
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input noise of shape `(n_samples, latent_dim)`.

        Returns
        -------
        torch.Tensor
            Generated images of shape `(n_samples, 3, 128, 128)`.
        """
        x = x.reshape(*x.shape, 1, 1)  # (n_samples, latent_dim, 1, 1)
        return self.main(x)


class Discriminator(nn.Module):
    """Discriminator network.

    Parameters
    ----------
    ndf : int
        Number of discriminator filters. It represents the number of filters
        after the first convolution block. Each consecutive block will double
        the number.

    augment_module : nn.Module or None
        If provided it represents the Kornia module that performs
        differentiable augmentation of the images.

    Attributes
    ----------
    augment_module : nn.Module
        If the input parameter `augment_module` provided then this is the
        same thing. If not, then this is just an identity mapping.
    """

    def __init__(self, ndf=16, augment_module=None):
        super().__init__()
        self.main = nn.Sequential(
            # 3 x 128 x 128
            nn.Conv2d(3, ndf, 4, stride=2, padding=1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # ndf x 64 x 64
            nn.Conv2d(ndf, ndf * 2, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # (ndf * 2) x 32 x 32
            nn.Conv2d(ndf * 2, ndf * 4, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # (ndf * 4) x 16 x 16
            nn.Conv2d(ndf * 4, ndf * 8, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            # (ndf * 8) x 8 x 8
            nn.Conv2d(ndf * 8, ndf * 16, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 16),
            nn.LeakyReLU(0.2, inplace=True),
            # (ndf * 16) x 4 x 4
            nn.Conv2d(ndf * 16, 1, 4, stride=1, padding=0, bias=False),
            nn.Sigmoid()
            # 1 x 1 x 1
        )

        if augment_module is not None:
            self.augment_module = augment_module
        else:
            self.augment_module = nn.Identity()

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input images of shape `(n_samples, 3, 128, 128)`.

        Returns
        -------
        torch.Tensor
            Classification outputs of shape `(n_samples, 1)`.
        """
        # Augmentations are only applied in training mode.
        if self.training:
            x = self.augment_module(x)

        x = self.main(x)  # (n_samples, 1, 1, 1)
        x = x.reshape(len(x), -1)  # (n_samples, 1)

        return x


def init_weights_(module):
    """Initialize weights by sampling from a normal distribution.

    Note that this operation is modifying the weights in place.

    Parameters
    ----------
    module : nn.Module
        Module with trainable weights.
    """
    cls_name = module.__class__.__name__
    if cls_name in {"Conv2d", "ConvTranspose2d"}:
        nn.init.normal_(module.weight.data, 0.0, 0.02)
    elif cls_name == "BatchNorm2d":
        nn.init.normal_(module.weight.data, 1.0, 0.02)
        nn.init.constant_(module.bias.data, 0.0)
You can find it here: https://github.com/fastai/imagenette (320 px version). ================================================ FILE: github_adventures/dino/data/imagenette_labels.json ================================================ {"n01440764": "tench", "n02102040": "english_springer", "n02979186": "cassette_player", "n03000684": "chain_saw", "n03028079": "church", "n03394916": "french_horn", "n03417042": "garbage_truck", "n03425413": "gas_pump", "n03445777": "golf_ball", "n03888257": "parachute"} ================================================ FILE: github_adventures/dino/evaluation.py ================================================ import numpy as np import torch from sklearn.metrics import accuracy_score from sklearn.neighbors import KNeighborsClassifier def compute_knn(backbone, data_loader_train, data_loader_val): """Get CLS embeddings and use KNN classifier on them. We load all embeddings in memory and use sklearn. Should be doable. Parameters ---------- backbone : timm.models.vision_transformer.VisionTransformer Vision transformer whose head is just an identity mapping. data_loader_train, data_loader_val : torch.utils.data.DataLoader Training and validation dataloader that does not apply any augmentations. Just casting to tensor and then normalizing. Returns ------- val_accuracy : float Validation accuracy. 
""" device = next(backbone.parameters()).device data_loaders = { "train": data_loader_train, "val": data_loader_val, } lists = { "X_train": [], "y_train": [], "X_val": [], "y_val": [], } for name, data_loader in data_loaders.items(): for imgs, y in data_loader: imgs = imgs.to(device) lists[f"X_{name}"].append(backbone(imgs).detach().cpu().numpy()) lists[f"y_{name}"].append(y.detach().cpu().numpy()) arrays = {k: np.concatenate(l) for k, l in lists.items()} estimator = KNeighborsClassifier() estimator.fit(arrays["X_train"], arrays["y_train"]) y_val_pred = estimator.predict(arrays["X_val"]) acc = accuracy_score(arrays["y_val"], y_val_pred) return acc def compute_embedding(backbone, data_loader): """Compute CLS embedding and prepare for TensorBoard. Parameters ---------- backbone : timm.models.vision_transformer.VisionTransformer Vision transformer. The head should be an identity mapping. data_loader : torch.utils.data.DataLoader Validation dataloader that does not apply any augmentations. Just casting to tensor and then normalizing. Returns ------- embs : torch.Tensor Embeddings of shape `(n_samples, out_dim)`. imgs : torch.Tensor Images of shape `(n_samples, 3, height, width)`. labels : list List of strings representing the classes. 
""" device = next(backbone.parameters()).device embs_l = [] imgs_l = [] labels = [] for img, y in data_loader: img = img.to(device) embs_l.append(backbone(img).detach().cpu()) imgs_l.append(((img * 0.224) + 0.45).cpu()) # undo norm labels.extend([data_loader.dataset.classes[i] for i in y.tolist()]) embs = torch.cat(embs_l, dim=0) imgs = torch.cat(imgs_l, dim=0) return embs, imgs, labels ================================================ FILE: github_adventures/dino/train.py ================================================ import argparse import json import pathlib import timm import torch import torchvision.transforms as transforms import tqdm from torch.utils.data import DataLoader, SubsetRandomSampler from torch.utils.tensorboard import SummaryWriter from torchvision.datasets import ImageFolder from evaluation import compute_embedding, compute_knn from utils import DataAugmentation, Head, Loss, MultiCropWrapper, clip_gradients def main(): parser = argparse.ArgumentParser( "DINO training CLI", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("-b", "--batch-size", type=int, default=32) parser.add_argument( "-d", "--device", type=str, choices=("cpu", "cuda"), default="cpu" ) parser.add_argument("-l", "--logging-freq", type=int, default=200) parser.add_argument("--momentum-teacher", type=int, default=0.9995) parser.add_argument("-c", "--n-crops", type=int, default=4) parser.add_argument("-e", "--n-epochs", type=int, default=100) parser.add_argument("-o", "--out-dim", type=int, default=1024) parser.add_argument("-t", "--tensorboard-dir", type=str, default="logs") parser.add_argument("--clip-grad", type=float, default=2.0) parser.add_argument("--norm-last-layer", action="store_true") parser.add_argument("--batch-size-eval", type=int, default=64) parser.add_argument("--teacher-temp", type=float, default=0.04) parser.add_argument("--student-temp", type=float, default=0.1) parser.add_argument("--pretrained", action="store_true") 
parser.add_argument("-w", "--weight-decay", type=float, default=0.4) args = parser.parse_args() print(vars(args)) # Parameters vit_name, dim = "vit_deit_small_patch16_224", 384 path_dataset_train = pathlib.Path("data/imagenette2-320/train") path_dataset_val = pathlib.Path("data/imagenette2-320/val") path_labels = pathlib.Path("data/imagenette_labels.json") logging_path = pathlib.Path(args.tensorboard_dir) device = torch.device(args.device) n_workers = 4 # Data related with path_labels.open("r") as f: label_mapping = json.load(f) transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2) transform_plain = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), transforms.Resize((224, 224)), ] ) dataset_train_aug = ImageFolder(path_dataset_train, transform=transform_aug) dataset_train_plain = ImageFolder(path_dataset_train, transform=transform_plain) dataset_val_plain = ImageFolder(path_dataset_val, transform=transform_plain) if dataset_train_plain.classes != dataset_val_plain.classes: raise ValueError("Inconsistent classes") data_loader_train_aug = DataLoader( dataset_train_aug, batch_size=args.batch_size, shuffle=True, drop_last=True, num_workers=n_workers, pin_memory=True, ) data_loader_train_plain = DataLoader( dataset_train_plain, batch_size=args.batch_size_eval, drop_last=False, num_workers=n_workers, ) data_loader_val_plain = DataLoader( dataset_val_plain, batch_size=args.batch_size_eval, drop_last=False, num_workers=n_workers, ) data_loader_val_plain_subset = DataLoader( dataset_val_plain, batch_size=args.batch_size_eval, drop_last=False, sampler=SubsetRandomSampler(list(range(0, len(dataset_val_plain), 50))), num_workers=n_workers, ) # Logging writer = SummaryWriter(logging_path) writer.add_text("arguments", json.dumps(vars(args))) # Neural network related student_vit = timm.create_model(vit_name, pretrained=args.pretrained) teacher_vit = timm.create_model(vit_name, 
pretrained=args.pretrained) student = MultiCropWrapper( student_vit, Head( dim, args.out_dim, norm_last_layer=args.norm_last_layer, ), ) teacher = MultiCropWrapper(teacher_vit, Head(dim, args.out_dim)) student, teacher = student.to(device), teacher.to(device) teacher.load_state_dict(student.state_dict()) for p in teacher.parameters(): p.requires_grad = False # Loss related loss_inst = Loss( args.out_dim, teacher_temp=args.teacher_temp, student_temp=args.student_temp, ).to(device) lr = 0.0005 * args.batch_size / 256 optimizer = torch.optim.AdamW( student.parameters(), lr=lr, weight_decay=args.weight_decay, ) # Training loop n_batches = len(dataset_train_aug) // args.batch_size best_acc = 0 n_steps = 0 for e in range(args.n_epochs): for i, (images, _) in tqdm.tqdm( enumerate(data_loader_train_aug), total=n_batches ): if n_steps % args.logging_freq == 0: student.eval() # Embedding embs, imgs, labels_ = compute_embedding( student.backbone, data_loader_val_plain_subset, ) writer.add_embedding( embs, metadata=[label_mapping[l] for l in labels_], label_img=imgs, global_step=n_steps, tag="embeddings", ) # KNN current_acc = compute_knn( student.backbone, data_loader_train_plain, data_loader_val_plain, ) writer.add_scalar("knn-accuracy", current_acc, n_steps) if current_acc > best_acc: torch.save(student, logging_path / "best_model.pth") best_acc = current_acc student.train() images = [img.to(device) for img in images] teacher_output = teacher(images[:2]) student_output = student(images) loss = loss_inst(student_output, teacher_output) optimizer.zero_grad() loss.backward() clip_gradients(student, args.clip_grad) optimizer.step() with torch.no_grad(): for student_ps, teacher_ps in zip( student.parameters(), teacher.parameters() ): teacher_ps.data.mul_(args.momentum_teacher) teacher_ps.data.add_( (1 - args.momentum_teacher) * student_ps.detach().data ) writer.add_scalar("train_loss", loss, n_steps) n_steps += 1 if __name__ == "__main__": main() 
================================================ FILE: github_adventures/dino/utils.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as transforms from PIL import Image class DataAugmentation: """Create crops of an input image together with additional augmentation. It generates 2 global crops and `n_local_crops` local crops. Parameters ---------- global_crops_scale : tuple Range of sizes for the global crops. local_crops_scale : tuple Range of sizes for the local crops. n_local_crops : int Number of local crops to create. size : int The size of the final image. Attributes ---------- global_1, global_2 : transforms.Compose Two global transforms. local : transforms.Compose Local transform. Note that the augmentation is stochastic so one instance is enough and will lead to different crops. """ def __init__( self, global_crops_scale=(0.4, 1), local_crops_scale=(0.05, 0.4), n_local_crops=8, size=224, ): self.n_local_crops = n_local_crops RandomGaussianBlur = lambda p: transforms.RandomApply( # noqa [transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2))], p=p, ) flip_and_jitter = transforms.Compose( [ transforms.RandomHorizontalFlip(p=0.5), transforms.RandomApply( [ transforms.ColorJitter( brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1, ), ] ), transforms.RandomGrayscale(p=0.2), ] ) normalize = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ] ) self.global_1 = transforms.Compose( [ transforms.RandomResizedCrop( size, scale=global_crops_scale, interpolation=Image.BICUBIC, ), flip_and_jitter, RandomGaussianBlur(1.0), # always apply normalize, ], ) self.global_2 = transforms.Compose( [ transforms.RandomResizedCrop( size, scale=global_crops_scale, interpolation=Image.BICUBIC, ), flip_and_jitter, RandomGaussianBlur(0.1), transforms.RandomSolarize(170, p=0.2), normalize, ], ) self.local = 
transforms.Compose( [ transforms.RandomResizedCrop( size, scale=local_crops_scale, interpolation=Image.BICUBIC, ), flip_and_jitter, RandomGaussianBlur(0.5), normalize, ], ) def __call__(self, img): """Apply transformation. Parameters ---------- img : PIL.Image Input image. Returns ------- all_crops : list List of `torch.Tensor` representing different views of the input `img`. """ all_crops = [] all_crops.append(self.global_1(img)) all_crops.append(self.global_2(img)) all_crops.extend([self.local(img) for _ in range(self.n_local_crops)]) return all_crops class Head(nn.Module): """Network hooked up to the CLS token embedding. Just a MLP with the last layer being normalized in a particular way. Parameters ---------- in_dim : int The dimensionality of the token embedding. out_dim : int The dimensionality of the final layer (we compute the softmax over). hidden_dim : int Dimensionality of the hidden layers. bottleneck_dim : int Dimensionality of the second last layer. n_layers : int The number of layers. norm_last_layer : bool If True, then we freeze the norm of the weight of the last linear layer to 1. Attributes ---------- mlp : nn.Sequential Vanilla multi-layer perceptron. last_layer : nn.Linear Reparametrized linear layer with weight normalization. That means that that it will have `weight_g` and `weight_v` as learnable parameters instead of a single `weight`. 
""" def __init__( self, in_dim, out_dim, hidden_dim=512, bottleneck_dim=256, n_layers=3, norm_last_layer=False, ): super().__init__() if n_layers == 1: self.mlp = nn.Linear(in_dim, bottleneck_dim) else: layers = [nn.Linear(in_dim, hidden_dim)] layers.append(nn.GELU()) for _ in range(n_layers - 2): layers.append(nn.Linear(hidden_dim, hidden_dim)) layers.append(nn.GELU()) layers.append(nn.Linear(hidden_dim, bottleneck_dim)) self.mlp = nn.Sequential(*layers) self.apply(self._init_weights) self.last_layer = nn.utils.weight_norm( nn.Linear(bottleneck_dim, out_dim, bias=False) ) self.last_layer.weight_g.data.fill_(1) if norm_last_layer: self.last_layer.weight_g.requires_grad = False def _init_weights(self, m): """Initialize learnable parameters.""" if isinstance(m, nn.Linear): nn.init.normal_(m.weight, std=0.02) if m.bias is not None: nn.init.constant_(m.bias, 0) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Of shape `(n_samples, in_dim)`. Returns ------- torch.Tensor Of shape `(n_samples, out_dim)`. """ x = self.mlp(x) # (n_samples, bottleneck_dim) x = nn.functional.normalize(x, dim=-1, p=2) # (n_samples, bottleneck_dim) x = self.last_layer(x) # (n_samples, out_dim) return x class MultiCropWrapper(nn.Module): """Convenience class for forward pass of multiple crops. Parameters ---------- backbone : timm.models.vision_transformer.VisionTransformer Instantiated Vision Transformer. Note that we will take the `head` attribute and replace it with `nn.Identity`. new_head : Head New head that is going to be put on top of the `backbone`. """ def __init__(self, backbone, new_head): super().__init__() backbone.head = nn.Identity() # deactivate original head self.backbone = backbone self.new_head = new_head def forward(self, x): """Run the forward pass. The different crops are concatenated along the batch dimension and then a single forward pass is fun. The resulting tensor is then chunked back to per crop tensors. 
Parameters ---------- x : list List of `torch.Tensor` each of shape `(n_samples, 3, size, size)`. Returns ------- tuple Tuple of `torch.Tensor` each of shape `(n_samples, out_dim)` where `output_dim` is determined by `Head`. """ n_crops = len(x) concatenated = torch.cat(x, dim=0) # (n_samples * n_crops, 3, size, size) cls_embedding = self.backbone(concatenated) # (n_samples * n_crops, in_dim) logits = self.new_head(cls_embedding) # (n_samples * n_crops, out_dim) chunks = logits.chunk(n_crops) # n_crops * (n_samples, out_dim) return chunks class Loss(nn.Module): """The loss function. We subclass the `nn.Module` becuase we want to create a buffer for the logits center of the teacher. Parameters ---------- out_dim : int The dimensionality of the final layer (we computed the softmax over). teacher_temp, student_temp : float Softmax temperature of the teacher resp. student. center_momentum : float Hyperparameter for the exponential moving average that determines the center logits. The higher the more the running average matters. """ def __init__( self, out_dim, teacher_temp=0.04, student_temp=0.1, center_momentum=0.9 ): super().__init__() self.student_temp = student_temp self.teacher_temp = teacher_temp self.center_momentum = center_momentum self.register_buffer("center", torch.zeros(1, out_dim)) def forward(self, student_output, teacher_output): """Evaluate loss. Parameters ---------- student_output, teacher_output : tuple Tuple of tensors of shape `(n_samples, out_dim)` representing logits. The length is equal to number of crops. Note that student processed all crops and that the two initial crops are the global ones. Returns ------- loss : torch.Tensor Scalar representing the average loss. 
""" student_temp = [s / self.student_temp for s in student_output] teacher_temp = [(t - self.center) / self.teacher_temp for t in teacher_output] student_sm = [F.log_softmax(s, dim=-1) for s in student_temp] teacher_sm = [F.softmax(t, dim=-1).detach() for t in teacher_temp] total_loss = 0 n_loss_terms = 0 for t_ix, t in enumerate(teacher_sm): for s_ix, s in enumerate(student_sm): if t_ix == s_ix: continue loss = torch.sum(-t * s, dim=-1) # (n_samples,) total_loss += loss.mean() # scalar n_loss_terms += 1 total_loss /= n_loss_terms self.update_center(teacher_output) return total_loss @torch.no_grad() def update_center(self, teacher_output): """Update center used for teacher output. Compute the exponential moving average. Parameters ---------- teacher_output : tuple Tuple of tensors of shape `(n_samples, out_dim)` where each tensor represents a different crop. """ batch_center = torch.cat(teacher_output).mean( dim=0, keepdim=True ) # (1, out_dim) self.center = self.center * self.center_momentum + batch_center * ( 1 - self.center_momentum ) def clip_gradients(model, clip=2.0): """Rescale norm of computed gradients. Parameters ---------- model : nn.Module Module. clip : float Maximum norm. 
""" for p in model.parameters(): if p.grad is not None: param_norm = p.grad.data.norm(2) clip_coef = clip / (param_norm + 1e-6) if clip_coef < 1: p.grad.data.mul_(clip_coef) ================================================ FILE: github_adventures/dino/visualize_attentions.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "1a3bd5ec", "metadata": {}, "outputs": [], "source": [ "import ipywidgets\n", "import matplotlib.pyplot as plt\n", "import timm\n", "import torch\n", "from torchvision.datasets import ImageFolder\n", "import torchvision.transforms as transforms\n", "from torchvision.utils import make_grid\n", "import torch.nn.functional as F" ] }, { "cell_type": "markdown", "id": "a6eaa0ef", "metadata": {}, "source": [ "# Helpers" ] }, { "cell_type": "code", "execution_count": null, "id": "2c0b2e7c", "metadata": {}, "outputs": [], "source": [ "def get_last_attention(backbone, x):\n", " \"\"\"Get the attention weights of CLS from the last self-attention layer.\n", "\n", " Very hacky!\n", "\n", " Parameters\n", " ----------\n", " backbone : timm.models.vision_transformer.VisionTransformer\n", " Instantiated Vision Transformer. 
Note that we will in-place\n", " take the `head` attribute and replace it with `nn.Identity`.\n", "\n", " x : torch.Tensor\n", " Batch of images of shape `(n_samples, 3, size, size)`.\n", "\n", " Returns\n", " -------\n", " torch.Tensor\n", " Attention weights `(n_samples, n_heads, n_patches)`.\n", " \"\"\"\n", " attn_module = backbone.blocks[-1].attn\n", " n_heads = attn_module.num_heads\n", "\n", " # define hook\n", " inp = None\n", " def fprehook(self, inputs):\n", " nonlocal inp\n", " inp = inputs[0]\n", "\n", " # Register a hook\n", " handle = attn_module.register_forward_pre_hook(fprehook)\n", "\n", " # Run forward pass\n", " _ = backbone(x)\n", " handle.remove()\n", "\n", " B, N, C = inp.shape\n", " qkv = attn_module.qkv(inp).reshape(B, N, 3, n_heads, C // n_heads).permute(2, 0, 3, 1, 4)\n", " q, k, v = qkv[0], qkv[1], qkv[2]\n", "\n", " attn = (q @ k.transpose(-2, -1)) * attn_module.scale\n", " attn = attn.softmax(dim=-1)\n", "\n", " return attn[:, :, 0, 1:]" ] }, { "cell_type": "code", "execution_count": null, "id": "57b72b84", "metadata": {}, "outputs": [], "source": [ "def threshold(attn, k=30):\n", " n_heads = len(attn)\n", " indices = attn.argsort(dim=1, descending=True)[:, k:]\n", "\n", " for head in range(n_heads):\n", " attn[head, indices[head]] = 0\n", "\n", " attn /= attn.sum(dim=1, keepdim=True)\n", "\n", " return attn" ] }, { "cell_type": "code", "execution_count": null, "id": "59e9009d", "metadata": {}, "outputs": [], "source": [ "def visualize_attention(img, backbone, k=30):\n", " \"\"\"Create attention image.\n", "\n", " Parameteres\n", " -----------\n", " img : PIL.Image\n", " RGB image.\n", "\n", " backbone : timm.models.vision_transformer.VisionTransformer\n", " The vision transformer.\n", "\n", " Returns\n", " -------\n", " new_img : torch.Tensor\n", " Image of shape (n_heads, 1, height, width).\n", " \"\"\"\n", " # imply parameters\n", "\n", " patch_size = backbone.patch_embed.proj.kernel_size[0]\n", "\n", " transform = 
transforms.Compose([\n", "\n", " transforms.Resize((224, 224)),\n", " transforms.ToTensor(),\n", " transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),\n", " ]\n", " )\n", "\n", " device = next(backbone.parameters()).device\n", " x = transform(img)[None, ...].to(device)\n", " attn = get_last_attention(backbone, x)[0] # (n_heads, n_patches)\n", " attn = attn / attn.sum(dim=1, keepdim=True) # (n_heads, n_patches)\n", " attn = threshold(attn, k)\n", " attn = attn.reshape(-1, 14, 14) # (n_heads, 14, 14)\n", " attn = F.interpolate(attn.unsqueeze(0),\n", " scale_factor=patch_size,\n", " mode=\"nearest\"\n", " )[0]\n", "\n", " return attn" ] }, { "cell_type": "markdown", "id": "df0972ec", "metadata": {}, "source": [ "# Preparation" ] }, { "cell_type": "code", "execution_count": null, "id": "d6e0d987", "metadata": {}, "outputs": [], "source": [ "models = {\n", " \"supervised\": timm.create_model(\"vit_deit_small_patch16_224\", pretrained=True),\n", " \"selfsupervised\": torch.load(\"best_model.pth\", map_location=\"cpu\").backbone,\n", "}\n", "dataset = ImageFolder(\"data/imagenette2-320/val\")\n", "\n", "colors = [\"yellow\", \"red\", \"green\", \"blue\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "690e3a1f", "metadata": { "scrolled": false }, "outputs": [], "source": [ "@ipywidgets.interact\n", "def _(\n", " i=ipywidgets.IntSlider(min=0, max=len(dataset) - 1, continuous_update=False),\n", " k=ipywidgets.IntSlider(min=0, max=195, value=10, continuous_update=False),\n", " model=ipywidgets.Dropdown(options=[\"supervised\", \"selfsupervised\"]),\n", "):\n", " img = dataset[i][0]\n", " attns = visualize_attention(img, models[model], k=k).detach()[:].permute(1, 2, 0).numpy()\n", "\n", " tform = transforms.Compose([\n", "\n", " transforms.Resize((224, 224)),\n", " ])\n", " # original image\n", " plt.imshow(tform(img))\n", " plt.axis(\"off\")\n", " plt.show()\n", "\n", " kwargs = {\"vmin\": 0, \"vmax\": 0.24}\n", " # Attentions\n", " n_heads = 
6\n", "\n", " fig, axs = plt.subplots(2, 3, figsize=(10, 7))\n", " \n", " for i in range(n_heads):\n", " ax = axs[i // 3, i % 3]\n", " ax.imshow(attns[..., i], **kwargs)\n", " ax.axis(\"off\")\n", " \n", " plt.tight_layout()\n", " \n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "d83eae10", "metadata": {}, "outputs": [], "source": [ "# 3244, 1942, 3482, 688, 1509, 3709" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: github_adventures/dino/visualize_augmentations.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "5801191a", "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "import ipywidgets\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import torch\n", "from PIL import Image\n", "from torchvision.datasets import ImageFolder\n", "\n", "from utils import DataAugmentation" ] }, { "cell_type": "code", "execution_count": null, "id": "ad4f7f91", "metadata": {}, "outputs": [], "source": [ "def to_numpy(t):\n", " array = torch.clip((t * 0.224) + 0.45, 0, 1).permute(1, 2, 0).numpy()\n", " return array\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "db09874a", "metadata": {}, "outputs": [], "source": [ "transform = DataAugmentation(n_local_crops=2)\n", "dataset = ImageFolder(\"data/imagenette2-320/train/\", transform=transform)" ] }, { "cell_type": "code", "execution_count": null, "id": "48738037", "metadata": {}, "outputs": [], "source": [ "@ipywidgets.interact\n", "def _(\n", " 
i=ipywidgets.IntSlider(min=0, max=len(dataset) - 1, continuous_update=False),\n", " seed=ipywidgets.IntSlider(min=0, max=50, continuous_update=False),\n", "):\n", " torch.manual_seed(seed)\n", " all_crops, _ = dataset[i]\n", " titles = [\"Global 1\", \"Global 2\", \"Local 1\", \"Local 2\"]\n", " \n", " original_img = np.array(Image.open(dataset.samples[i][0]))\n", " _, ax_orig = plt.subplots(figsize=(15, 5))\n", " ax_orig.imshow(original_img)\n", " ax_orig.set_title(\"Original\")\n", " ax_orig.axis(\"off\")\n", " \n", " \n", " fig, axs = plt.subplots(2, 2, figsize=(10, 10))\n", " \n", " for i, title in enumerate(titles):\n", " ax = axs[i // 2, i % 2]\n", " ax.imshow(to_numpy(all_crops[i]))\n", " ax.set_title(title)\n", " ax.axis(\"off\")\n", " fig.tight_layout()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: github_adventures/gpt/README.md ================================================ # GPT-2 custom implementation ## Installation ```python pip install -r requirements.txt ``` ## Launching script To copy weights of an official model + generate some text use the script `copy_and_generate.py` ```python (gpt) gpt$ python copy_and_generate.py --help usage: Copy weights of a HF model and generate text. 
[-h] [--sample] [-s STEPS] [-r RANDOM_STATE] [-t TEMPERATURE] [-k TOP_K] [-v] {gpt2,gpt2-medium,gpt2-large,distilgpt2} initial_text positional arguments: {gpt2,gpt2-medium,gpt2-large,distilgpt2} Pretrained model to use initial_text Initial text optional arguments: -h, --help show this help message and exit --sample If True sample randomly otherwise take the most probable token (default: False) -s STEPS, --steps STEPS Number of new tokens to generate (default: 30) -r RANDOM_STATE, --random-state RANDOM_STATE Random state (default: None) -t TEMPERATURE, --temperature TEMPERATURE Softmax logits temperature (default: 1) -k TOP_K, --top-k TOP_K If specified, then selecting k most probable tokens (default: None) -v, --verbose If True, then verbose (default: False) ``` ================================================ FILE: github_adventures/gpt/copy_and_generate.py ================================================ import argparse import logging import torch from model import GPT from transformers import AutoModelForCausalLM, AutoTokenizer from utils import copy_model, generate_token logging.basicConfig(format="[%(levelname)s] %(asctime)s %(message)s") logger = logging.getLogger(__file__) def main(argv=None): """Copy weights and generate some text.""" parser = argparse.ArgumentParser( "Copy weights of a HF model and generate text.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "model_name", type=str, choices=("gpt2", "gpt2-medium", "gpt2-large", "distilgpt2"), help="Pretrained model to use", ) parser.add_argument( "initial_text", type=str, help="Initial text", ) parser.add_argument( "--sample", action="store_true", help="If True sample randomly otherwise take the most probable token", ) parser.add_argument( "-s", "--steps", default=30, type=int, help="Number of new tokens to generate", ) parser.add_argument("-r", "--random-state", type=int, help="Random state") parser.add_argument( "-t", "--temperature", default=1, type=float, help="Softmax 
logits temperature", ) parser.add_argument( "-k", "--top-k", type=int, help="If specified, then selecting k most probable tokens", ) parser.add_argument( "-v", "--verbose", action="store_true", help="If True, then verbose" ) args = parser.parse_args(argv) # Setup logging if args.verbose: logger.setLevel(logging.INFO) else: logger.setLevel(logging.WARNING) logger.info(f"CLI parameters: {vars(args)})") tokenizer = AutoTokenizer.from_pretrained(args.model_name) model_official = AutoModelForCausalLM.from_pretrained(args.model_name) config_official = model_official.config our_params = [ "vocab_size", "n_layer", "n_embd", "n_head", "n_positions", "attn_pdrop", "embd_pdrop", "resid_pdrop", "layer_norm_epsilon", ] config_ours = {k: getattr(config_official, k) for k in our_params} logger.info(f"Model hyperparameters: {config_ours}") model_ours = GPT(**config_ours) model_ours.eval() copy_model(model_official, model_ours) token_ixs = tokenizer(args.initial_text)["input_ids"] if args.random_state: torch.manual_seed(args.random_state) # Sample for step in range(args.steps): new_token_ix = generate_token( model_ours, token_ixs, sample=args.sample, top_k=args.top_k, temperature=args.temperature, ) token_ixs.append(new_token_ix) logger.info(f"Step {step} done") text = tokenizer.decode(token_ixs) print(text) if __name__ == "__main__": main() ================================================ FILE: github_adventures/gpt/distribution_visualizations.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "896ffe86", "metadata": {}, "outputs": [], "source": [ "import ipywidgets\n", "\n", "import matplotlib.pyplot as plt\n", "import torch" ] }, { "cell_type": "markdown", "id": "09b6e1f4", "metadata": {}, "source": [ "#
Applying temperature + keeping only top K values
" ] }, { "cell_type": "markdown", "id": "2c7442cf", "metadata": {}, "source": [ "$T=\\mbox{temperature}$ $$\\large P_i=\\frac{e^{\\frac{y_i}T}}{\\sum_{k=1}^n e^{\\frac{y_k}T}}$$" ] }, { "cell_type": "code", "execution_count": null, "id": "95833de6", "metadata": {}, "outputs": [], "source": [ "@ipywidgets.interact\n", "def _(\n", " n_tokens=ipywidgets.IntSlider(min=4, max=30, value=8, continuous_update=False),\n", " random_state=ipywidgets.IntSlider(min=0, max=10, value=2, continuous_update=False),\n", " temperature=ipywidgets.FloatSlider(min=0, max=10, value=1, continuous_update=False),\n", " top_k=ipywidgets.IntSlider(min=1, max=20, value=8, continuous_update=False),\n", " ):\n", " # Preparations\n", " top_k = min(top_k, n_tokens)\n", " torch.manual_seed(random_state)\n", " logits = 10 * torch.rand(n_tokens,)\n", "\n", "\n", " # Generate original\n", " probs_orig = torch.nn.functional.softmax(logits, dim=0).numpy()\n", " \n", " # Generate new\n", " logits = logits / temperature\n", " top_values, _ = torch.topk(logits, top_k) # (top_k,) \n", " logits[logits < top_values.min()] = -torch.inf \n", " probs_new = torch.nn.functional.softmax(logits, dim=0).numpy()\n", "\n", " # Plotting\n", " fig, (ax_orig, ax_new) = plt.subplots(1, 2, sharey=True, figsize=(10, 2), dpi=100)\n", " x = range(n_tokens)\n", "\n", " ax_orig.bar(x, probs_orig)\n", " ax_orig.set_ylim((0, 1))\n", " ax_orig.set_title(\"Original\")\n", " \n", " ax_new.bar(x, probs_new)\n", " ax_new.set_title(\"Temperature + top K\")\n", " \n", " plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: 
github_adventures/gpt/ipython_code.py ================================================ >>> import torch >>> from model import GPT >>> from transformers import AutoModelForCausalLM >>> hparams_names = [ ... "vocab_size", ... "n_layer", ... "n_embd", ... "n_head", ... "n_positions", ... "attn_pdrop", ... "embd_pdrop", ... "resid_pdrop", ... "layer_norm_epsilon", ... ] ... >>> model_name = "gpt2" >>> model_official = AutoModelForCausalLM.from_pretrained(model_name, tie_word_embeddings=False) >>> config_official = model_official.config >>> config_official >>> config_ours = {name: getattr(config_official, name) for name in hparams_names} >>> config_ours >>> model_ours = GPT(**config_ours) >>> sum(p.numel() for p in model_ours.parameters()) >>> sum(p.numel() for p in model_official.parameters()) >>> _ = model_official.eval() >>> _ = model_ours.eval() >>> idx = torch.tensor([[1, 123, 52, 28]], dtype=torch.long) >>> logits_official = model_official(idx).logits >>> logits_ours = model_ours(idx) >>> logits_official.shape >>> logits_ours.shape >>> torch.allclose(logits_ours, logits_official, rtol=0, atol=1e-3) >>> (logits_ours - logits_official).abs().max() >>> from utils import copy_model >>> copy_model(model_official, model_ours) >>> logits_official = model_official(idx).logits >>> logits_ours = model_ours(idx) >>> torch.allclose(logits_ours, logits_official, rtol=0, atol=1e-3) >>> (logits_ours - logits_official).abs().max() ================================================ FILE: github_adventures/gpt/model.py ================================================ import torch import torch.nn as nn from transformers.activations import gelu_new class CustomGELU(nn.Module): """GELU implementation taken from the `transformers`.""" def forward(self, x): """Run forward pass.""" return gelu_new(x) class Block(nn.Module): """Decoder block. Parameters ---------- n_embd : int Dimensionality of the embeddings. n_head : int Number of attention heads. 
class Block(nn.Module):
    """Single GPT decoder block: causal self-attention followed by an MLP.

    Parameters
    ----------
    n_embd : int
        Dimensionality of the embeddings.
    n_head : int
        Number of attention heads.
    n_positions : int
        Maximum number of tokens.
    attn_pdrop : float
        Probability of dropout on attention weights.
    resid_pdrop : float
        Probability of dropout after applying the MLP.
    layer_norm_epsilon : float
        Hyperparameter of layer normalization.

    Attributes
    ----------
    ln_1, ln_2 : nn.LayerNorm
        Pre-attention and pre-MLP layer norms.
    attention : nn.MultiheadAttention
        Multi-head self-attention module.
    mask : torch.Tensor (buffer)
        Boolean causal mask; True entries are positions attention may NOT
        look at (strictly above the diagonal).
    mlp : nn.Sequential
        Position-wise feed-forward network.
    """

    def __init__(
        self,
        *,
        n_embd,
        n_head,
        n_positions,
        attn_pdrop,
        resid_pdrop,
        layer_norm_epsilon,
    ):
        super().__init__()

        self.ln_1 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)
        self.ln_2 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)

        self.attention = nn.MultiheadAttention(
            embed_dim=n_embd,
            num_heads=n_head,
            dropout=attn_pdrop,
            bias=True,
            batch_first=True,
        )
        # Strictly-upper-triangular boolean mask == (1 - tril(ones)).bool()
        causal_mask = torch.triu(
            torch.ones(n_positions, n_positions, dtype=torch.bool), diagonal=1
        )
        self.register_buffer("mask", causal_mask)

        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            CustomGELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape `(batch_size, n_tokens, n_embd)`.

        Returns
        -------
        torch.Tensor
            Output tensor of shape `(batch_size, n_tokens, n_embd)`.
        """
        _, n_tokens, _ = x.shape

        normed = self.ln_1(x)  # (batch_size, n_tokens, n_embd)
        causal_mask = self.mask[:n_tokens, :n_tokens]  # (n_tokens, n_tokens)

        attn_out, _ = self.attention(
            normed, normed, normed, attn_mask=causal_mask, need_weights=False
        )  # (batch_size, n_tokens, n_embd)

        x = x + attn_out  # residual connection around attention
        x = x + self.mlp(self.ln_2(x))  # residual connection around MLP

        return x
class GPT(nn.Module):
    """Entire GPT model: embeddings, a stack of decoder blocks, and a head.

    Parameters
    ----------
    vocab_size : int
        Number of tokens in the vocabulary.
    n_layer : int
        Number of decoder blocks to include.
    n_embd : int
        Dimensionality of the embeddings.
    n_head : int
        Number of attention heads.
    n_positions : int
        Maximum number of tokens.
    attn_pdrop : float
        Probability of dropout on attention weights.
    embd_pdrop : float
        Probability of dropout on the sum of embeddings.
    resid_pdrop : float
        Probability of dropout after applying the MLP.
    layer_norm_epsilon : float
        Hyperparameter of layer normalization.

    Attributes
    ----------
    token_emb : nn.Embedding
        Token embeddings.
    pos_emb : nn.Embedding
        Positional embedding.
    drop : nn.Dropout
        Dropout applied to the sum of the two embeddings.
    blocks : nn.Sequential
        Stack of decoder blocks.
    ln : nn.LayerNorm
        Final layer norm applied before `head`.
    head : nn.Linear
        Final (untied) linear projection to vocabulary logits.
    """

    def __init__(
        self,
        *,
        vocab_size,
        n_layer,
        n_embd,
        n_head,
        n_positions,
        attn_pdrop,
        embd_pdrop,
        resid_pdrop,
        layer_norm_epsilon,
    ):
        super().__init__()
        self.n_positions = n_positions
        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(n_positions, n_embd)

        self.drop = nn.Dropout(embd_pdrop)

        self.blocks = nn.Sequential(
            *(
                Block(
                    n_embd=n_embd,
                    n_head=n_head,
                    n_positions=n_positions,
                    attn_pdrop=attn_pdrop,
                    resid_pdrop=resid_pdrop,
                    layer_norm_epsilon=layer_norm_epsilon,
                )
                for _ in range(n_layer)
            )
        )
        self.ln = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx):
        """Run forward pass.

        Parameters
        ----------
        idx : torch.Tensor
            Integer tensor of shape `(batch_size, n_tokens)` where each
            element is in the range `[0, vocab_size)`.

        Returns
        -------
        logits : torch.Tensor
            Tensor of shape `(batch_size, n_tokens, vocab_size)`.

        Raises
        ------
        ValueError
            If the sequence is longer than `n_positions`.
        """
        _, n_tokens = idx.shape

        if n_tokens > self.n_positions:
            raise ValueError("There are too many tokens in the input")

        positions = torch.arange(n_tokens, device=idx.device)  # (n_tokens,)

        token_emb = self.token_emb(idx)  # (batch_size, n_tokens, n_embd)
        pos_emb = self.pos_emb(positions)[None, ...]  # (1, n_tokens, n_embd)

        hidden = self.drop(token_emb + pos_emb)  # (batch_size, n_tokens, n_embd)
        hidden = self.blocks(hidden)  # (batch_size, n_tokens, n_embd)
        hidden = self.ln(hidden)  # (batch_size, n_tokens, n_embd)

        return self.head(hidden)  # (batch_size, n_tokens, vocab_size)
def copy_parameter(param_official, param_ours):
    """Copy values of one tensor to another tensor, in place.

    Parameters
    ----------
    param_official : torch.Tensor
        The value of this tensor will be copied.

    param_ours : torch.Tensor
        This tensor will be overwritten in-place with the values from
        `param_official`.

    Raises
    ------
    ValueError
        If the two tensors do not have identical shapes.
    """
    if param_ours.shape != param_official.shape:
        raise ValueError("The shapes of the provided tensors are different")

    # Disable autograd tracking — this is a raw weight transfer.
    with torch.no_grad():
        param_ours.copy_(param_official)
""" b_a = block_official b_b = block_ours # LN 1 copy_parameter(b_a.ln_1.weight, b_b.ln_1.weight) copy_parameter(b_a.ln_1.bias, b_b.ln_1.bias) # Attention copy_parameter(b_a.attn.c_attn.weight.T, b_b.attention.in_proj_weight) copy_parameter(b_a.attn.c_attn.bias, b_b.attention.in_proj_bias) copy_parameter(b_a.attn.c_proj.weight.T, b_b.attention.out_proj.weight) copy_parameter(b_a.attn.c_proj.bias, b_b.attention.out_proj.bias) # LN 2 copy_parameter(b_a.ln_2.weight, b_b.ln_2.weight) copy_parameter(b_a.ln_2.bias, b_b.ln_2.bias) # MLP copy_parameter(b_a.mlp.c_fc.weight.T, b_b.mlp[0].weight) copy_parameter(b_a.mlp.c_fc.bias, b_b.mlp[0].bias) copy_parameter(b_a.mlp.c_proj.weight.T, b_b.mlp[2].weight) copy_parameter(b_a.mlp.c_proj.bias, b_b.mlp[2].bias) def copy_model(model_official, model_ours): """Copy all trainable weights. Parameters ---------- model_official : transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel Huggingface model. model_ours : GPT Our model. """ m_a = model_official m_b = model_ours # Token and positional embeddings copy_parameter(m_a.transformer.wpe.weight, m_b.pos_emb.weight) copy_parameter(m_a.transformer.wte.weight, m_b.token_emb.weight) # Blocks for block_official, block_ours in zip(m_a.transformer.h, m_b.blocks): copy_block(block_official, block_ours) # Head copy_parameter(m_a.transformer.ln_f.weight, m_b.ln.weight) copy_parameter(m_a.transformer.ln_f.bias, m_b.ln.bias) copy_parameter(m_a.lm_head.weight, m_b.head.weight) @torch.no_grad() def generate_token( model, token_ixs, temperature=1.0, sample=False, top_k=None ): """Generate a single token given previous tokens. Parameters ---------- model : GPT Our GPT model. token_ixs : list List of conditional input token ids. temperature : float The higher the more variability and vice versa. sample : bool If True, we sample from the distribution (=there is randomness). If False, we just take the argmax (=there is no randomness). 
@torch.no_grad()
def generate_token(
    model, token_ixs, temperature=1.0, sample=False, top_k=None
):
    """Generate a single token given previous tokens.

    Parameters
    ----------
    model : GPT
        Our GPT model.

    token_ixs : list
        List of conditional input token ids.

    temperature : float
        The higher the more variability and vice versa.

    sample : bool
        If True, we sample from the distribution (=there is randomness).
        If False, we just take the argmax (=there is no randomness).

    top_k : int or None
        If not None then we modify the distribution to only contain the
        `top_k` most probable outcomes.

    Returns
    -------
    new_token_ix : int
        Index of the new token.
    """
    # The model can only attend to the last `n_positions` tokens.
    context = token_ixs[-model.n_positions :]
    input_ixs = torch.tensor(context, dtype=torch.long)[None, :]  # (1, n_tokens)

    all_logits = model(input_ixs)  # (1, n_tokens, vocab_size)
    logits = all_logits[0, -1, :] / temperature  # (vocab_size,)

    if top_k is not None:
        # Keep the k largest logits, push everything else to -inf
        kth_best = torch.topk(logits, top_k).values.min()
        logits = logits.masked_fill(logits < kth_best, -torch.inf)

    probs = torch.nn.functional.softmax(logits, dim=0)  # (vocab_size,)

    if sample:
        chosen = torch.multinomial(probs, num_samples=1)
    else:
        chosen = probs.argmax()

    return chosen.item()
def main(argv=None):
    """Evaluate BERT's integer embeddings and log results to tensorboard.

    Parameters
    ----------
    argv : list or None
        CLI arguments; `None` falls back to `sys.argv`.
    """
    parser = argparse.ArgumentParser("Evaluating BERT integer embeddings")
    parser.add_argument(
        "log_folder",
        type=str,
        help="Folder where to log results",
    )
    parser.add_argument(
        "--max-value-eval",
        type=int,
        default=500,
        help="Number of integers to run the evaluation on",
    )
    args = parser.parse_args(argv)

    model_name = "bert-base-uncased"

    # Tensorboard writer for metrics and the embedding projector
    writer = SummaryWriter(args.log_folder)

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Map each integer (as a string) to its position in BERT's vocabulary;
    # integers missing from the vocabulary map to the [UNK] id.
    wanted = list(map(str, range(args.max_value_eval)))
    positions = np.array(tokenizer.convert_tokens_to_ids(wanted))
    unk_token_position = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
    is_valid = positions != unk_token_position
    print(
        "The following numbers are missing",
        [i for i, x in enumerate(is_valid) if not x],
    )

    arange = np.arange(args.max_value_eval)
    numbers = arange[is_valid]
    # Pull the raw word-embedding rows for the valid integers
    embeddings = (
        model.embeddings.word_embeddings(torch.from_numpy(positions[is_valid]))
        .detach()
        .numpy()
    )

    # Binary targets (divisibility, primality) used to probe the embeddings
    ys_clf = create_classification_targets(numbers)
    keys = sorted(ys_clf.keys())
    metadata = np.array([numbers] + [ys_clf[k] for k in keys]).T.tolist()
    metadata_header = ["value"] + keys

    for name, y in ys_clf.items():
        metrics = train_classifier(embeddings, y)
        for metric_name, value in metrics.items():
            writer.add_scalar(f"{name}/{metric_name}", value)

    writer.add_embedding(
        embeddings,
        metadata=metadata,
        metadata_header=metadata_header,
    )


if __name__ == "__main__":
    main()
def get_sequence(sequence_id):
    """Get an integer sequence from the online OEIS.

    Parameters
    ----------
    sequence_id : int
        Unique identifier for the desired sequence.

    Returns
    -------
    sequence : list
        List of integers

    Raises
    ------
    HTTPError
        Was not possible to get the given sequence
    """
    # OEIS ids are zero-padded to 7 digits, e.g. A0000045
    url = f"https://oeis.org/search?fmt=json&q=id:A{sequence_id:07}"
    print(sequence_id)

    response = requests.get(url)
    response.raise_for_status()

    data_str = response.json()["results"][0]["data"]
    return [int(x) for x in data_str.split(",")]
def main(argv=None):
    """Evaluate GloVe integer embeddings and log results to tensorboard.

    Parameters
    ----------
    argv : list or None
        CLI arguments; `None` falls back to `sys.argv`.

    Raises
    ------
    ValueError
        If some requested integers are missing from the GloVe file.
    """
    parser = argparse.ArgumentParser("Evaluating GloVe integer embeddings")
    parser.add_argument(
        "glove_path",
        type=str,
        help="Path to a txt file holding the GloVe embeddings",
    )
    parser.add_argument(
        "log_folder",
        type=str,
        help="Folder where to log results",
    )
    parser.add_argument(
        "--max-value-eval",
        type=int,
        default=500,
        help="Number of integers to run the evaluation on",
    )
    parser.add_argument(
        "--dim",
        type=int,
        default=300,
        help="Dimensionality of the embeddings",
    )
    # BUGFIX: previously `parser.parse_args()` — the `argv` parameter was
    # silently ignored, unlike the sibling scripts bert.py and lstm.py.
    args = parser.parse_args(argv)

    # Create writer
    writer = SummaryWriter(args.log_folder)

    # Retrieve embeddings: scan the GloVe text file for integer tokens
    to_find = set(map(str, range(args.max_value_eval)))
    embeddings = np.empty((args.max_value_eval, args.dim))

    with open(args.glove_path) as f:
        for line in f:
            token, *vector_ = line.split(" ")
            if token in to_find:
                embeddings[int(token)] = list(map(float, vector_))
                to_find.remove(token)

    # Explicit error instead of `assert` (asserts are stripped under -O)
    if to_find:
        raise ValueError(
            f"Missing embeddings for {len(to_find)} integers"
        )

    arange = np.arange(args.max_value_eval)

    # Binary targets (divisibility, primality) used to probe the embeddings
    ys_clf = create_classification_targets(arange)
    keys = sorted(ys_clf.keys())
    metadata = np.array([arange] + [ys_clf[k] for k in keys]).T.tolist()
    metadata_header = ["value"] + keys

    for name, y in ys_clf.items():
        metrics = train_classifier(embeddings, y)
        for metric_name, value in metrics.items():
            writer.add_scalar(f"{name}/{metric_name}", value)

    writer.add_embedding(
        embeddings,
        metadata=metadata,
        metadata_header=metadata_header,
    )


if __name__ == "__main__":
    main()
def main(argv=None):
    """Train an LSTM next-integer predictor and log embeddings to tensorboard.

    Parameters
    ----------
    argv : list or None
        CLI arguments; `None` falls back to `sys.argv`.
    """
    parser = argparse.ArgumentParser("Embedding integers using LSTM")
    parser.add_argument(
        "data_path", type=str, help="Path to the pickled sequences"
    )
    parser.add_argument(
        "log_folder", type=str, help="Folder where to log results"
    )
    parser.add_argument(
        "-b", "--batch-size", type=int, default=128, help="Batch size"
    )
    parser.add_argument(
        "-d", "--dense-dim", type=int, default=256, help="Dense dimension"
    )
    parser.add_argument("--device", type=str, default="cpu", help="Device")
    parser.add_argument(
        "-e",
        "--embedding-dim",
        type=int,
        default=128,
        help="Embedding dimension",
    )
    parser.add_argument(
        "--hidden-dim", type=int, default=256, help="Hidden dimension"
    )
    parser.add_argument(
        "--max-value-eval",
        type=int,
        default=500,
        help="Evaluation limit",
    )
    parser.add_argument(
        "-m",
        "--max-value",
        type=int,
        default=20000,
        help="The maximum allowed value (non inclusive)",
    )
    parser.add_argument(
        "-n", "--n-epochs", type=int, default=100, help="Number of epochs"
    )
    parser.add_argument(
        "-l",
        "--sequence-len",
        type=int,
        default=100,
        help="The maximum length of a sequence",
    )
    args = parser.parse_args(argv)

    # Preparations
    device = torch.device(args.device)
    eval_frequency = 500
    log_folder = pathlib.Path(args.log_folder)
    model_path = log_folder / "checkpoint.pth"

    writer = SummaryWriter(log_folder)
    writer.add_text("parameters", json.dumps(vars(args)))

    # Dataset related
    data_path = pathlib.Path(args.data_path)
    with data_path.open("rb") as f:
        raw_sequences = pickle.load(f)

    dataset = CustomDataset(
        raw_sequences,
        max_value=args.max_value,
        sequence_len=args.sequence_len,
    )

    # Log the histogram of all values seen in the (normalized) sequences
    fig, ax = plt.subplots()
    ax.hist(dataset.normalized_sequences.ravel(), bins=100)
    ax.set_title(
        f"Number distribution (numbers={dataset.normalized_sequences.shape})"
    )
    writer.add_figure("number distribution", fig)

    dataloader = DataLoader(
        dataset,
        shuffle=True,
        batch_size=args.batch_size,
        pin_memory=True,
    )

    # Network, loss and the optimizer
    net = Network(
        max_value=args.max_value,
        hidden_dim=args.hidden_dim,
        embedding_dim=args.embedding_dim,
        dense_dim=args.dense_dim,
    )
    net.to(device)

    # `max_value` is the padding id — exclude it from the loss
    loss_inst = nn.CrossEntropyLoss(
        ignore_index=args.max_value,
    )
    optimizer = torch.optim.Adam(net.parameters())

    # Validation preparation
    max_value_eval = args.max_value_eval or args.max_value
    arange = np.arange(max_value_eval)
    ys_clf = create_classification_targets(arange)
    keys = sorted(ys_clf.keys())
    metadata = np.array([arange] + [ys_clf[k] for k in keys]).T.tolist()
    metadata_header = ["value"] + keys

    step = 0
    for _ in range(args.n_epochs):
        for batch in tqdm.tqdm(dataloader):
            batch = batch.to(device)

            all_logits = net(batch)  # (batch_size, sequence_len, max_value)
            # Predict token t+1 from token t: drop the last prediction and
            # move the class dim to position 1 as CrossEntropyLoss expects.
            logits = all_logits[:, :-1].permute(
                0, 2, 1
            )  # (batch_size, max_value, sequence_len - 1)
            target = batch[:, 1:]  # (batch_size, sequence_len - 1)

            loss = loss_inst(logits, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            writer.add_scalar("loss", loss, step)

            if step % eval_frequency == 0:
                # Probe the learned embeddings with linear classifiers
                X = (
                    net.embedding.weight.detach()
                    .cpu()
                    .numpy()[:max_value_eval]
                )
                writer.add_embedding(
                    X,
                    global_step=step,
                    tag="Integer embeddings",
                    metadata=metadata,
                    metadata_header=metadata_header,
                )

                for name, y in ys_clf.items():
                    metrics = train_classifier(X, y)
                    for metric_name, value in metrics.items():
                        writer.add_scalar(
                            f"{name}/{metric_name}",
                            value,
                            step,
                        )

                torch.save(net, model_path)

            step += 1


if __name__ == "__main__":
    main()
""" def __init__( self, raw_sequences, sequence_len=80, max_value=2000, ): filtered_sequences = list( filter( lambda seq: all( 0 <= x < max_value for x in seq[:sequence_len] ), raw_sequences, ) ) n_sequences = len(filtered_sequences) self.normalized_sequences = max_value * np.ones( (n_sequences, sequence_len), dtype=np.int64, ) for i, seq in enumerate(filtered_sequences): actual_len = min(len(seq), sequence_len) self.normalized_sequences[i, :actual_len] = seq[:actual_len] def __len__(self): """Get the length of the dataset.""" return len(self.normalized_sequences) def __getitem__(self, ix): """Get a single sample of the dataset.""" return self.normalized_sequences[ix] class Network(nn.Module): """Network predicting next number in the sequence. Parameters ---------- max_value : int Maximum integer value allowed inside of the sequence. We will generate an embedding for each of the numbers in `[0, max_value]`. embedding_dim : int Dimensionality of the integer embeddings. n_layers : int Number of layers inside of the LSTM. hidden_dim : int Dimensionality of the hidden state (LSTM). dense_dim : int Dimensionality of the dense layer. Attributes ---------- embedding : torch.nn.Embedding Embeddings of all the integers. lstm : torch.nn.LSTM LSTM subnetwork. Inputs integer embeddings and outputs new hidden states. linear : torch.nn.Linear Inputs hidden states and tranforms them. classifier : torch.nn.Linear Inputs outputs of the `linear` and outputs the logits over all possible integers. """ def __init__( self, max_value=2000, embedding_dim=100, n_layers=2, hidden_dim=64, dense_dim=256, ): super().__init__() self.embedding = nn.Embedding( num_embeddings=max_value + 1, embedding_dim=embedding_dim, padding_idx=max_value, ) self.lstm = nn.LSTM( input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True, ) self.linear = nn.Linear( hidden_dim, dense_dim, ) self.classifier = nn.Linear( dense_dim, max_value, ) def forward(self, x): """Run forward pass. 
def train_classifier(X, y, random_state=2):
    """Cross-validate classification problem using logistic regression.

    Parameters
    ----------
    X : np.ndarray
        2D array holding the features of shape `(n_samples, n_features)`.

    y : np.ndarray
        1D array holding the classification targets of shape `(n_samples,)`.

    random_state : int
        Guaranteeing reproducibility.

    Returns
    -------
    metrics : dict
        Holds train and validation accuracy averaged over all the folds.
    """
    splitter = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=random_state,
    )
    # Standardize features before fitting the linear model
    pipeline = make_pipeline(
        StandardScaler(),
        LogisticRegression(
            max_iter=2000,
            random_state=random_state,
        ),
    )
    scores = cross_validate(
        pipeline,
        X,
        y,
        return_train_score=True,
        cv=splitter,
    )
    return {
        "train_acc": scores["train_score"].mean(),
        "test_acc": scores["test_score"].mean(),
    }
""" targets = { "divisibility_2": (indices % 2 == 0).astype(float), "divisibility_3": (indices % 3 == 0).astype(float), "divisibility_4": (indices % 4 == 0).astype(float), "divisibility_5": (indices % 5 == 0).astype(float), "divisibility_10": (indices % 10 == 0).astype(float), "prime": np.vectorize(isprime)(indices).astype(float), } return targets ================================================ FILE: github_adventures/lottery/README.md ================================================ # The Lottery Ticket Hypothesis ## Installation ```bash pip install -r requirements.txt ``` ## Running experiments The training logic is implemented inside of the script `main.py`. To get more information about the CLI run ```bash python main.py --help ``` If you want to run an entire grid search over different hyperparameters you can use the `parallel_launch.sh` script. Note that it depends on a tool called `parallel` ([more info](https://www.gnu.org/software/parallel/)). Note that the script allows for dry runs (default behavior) and progress bars. ```bash ./parallel_launch.sh ``` ================================================ FILE: github_adventures/lottery/data.py ================================================ from torch.utils.data import Dataset from torchvision.datasets import MNIST from torchvision.transforms import Compose, Lambda, ToTensor class MNISTDataset(Dataset): """MNIST dataset. Feature images are automatically flattened. Parameters ---------- root : str Directory where the actual data is located (or downloaded to). train : bool If True the training set is returned (60_000 samples). Otherwise the validation set is returned (10_000 samples). Attributes ---------- tv_dataset : MNIST Instance of the torchvision `MNIST` dataset class. 
""" def __init__(self, root, train=True, download=True): transform = Compose( [ ToTensor(), Lambda(lambda x: x.ravel()), ] ) self.tv_dataset = MNIST( root, train=train, download=download, transform=transform, ) def __len__(self): """Get the length of the dataset.""" return len(self.tv_dataset) def __getitem__(self, ix): """Get a selected sample. Parameters ---------- ix : int Index of the sample to get. Returns ------- x : torch.Tensor Flattened feature tensor of shape `(784,)`. y : torch.Tensor Scalar representing the ground truth label. Number between 0 and 9. """ return self.tv_dataset[ix] ================================================ FILE: github_adventures/lottery/main.py ================================================ import argparse import torch import torch.nn as nn import tqdm from torch.utils.data import DataLoader import wandb from data import MNISTDataset from utils import MLP, compute_stats, copy_weights_mlp, prune_mlp, reinit_mlp def loop_dataloader(dataloader): """Loop infinitely over a dataloader. Parameters ---------- dataloader : DataLoader DataLoader streaming batches of samples. Yields ------ X_batch : torch.Tensor Batch of features. y_batch : torch.Tensor Batch of predictions. """ while True: for x in iter(dataloader): yield x def train( model, dataloader_train, loss_inst, optimizer, max_iter=10_000, dataloader_val=None, val_freq=500, ): """Run the training loop. Parameters ---------- model : nn.Module Neural network (in our case MLP). dataloader_train : DataLoader Dataloader yielding training samples. loss_inst : callable Computes the loss when called. optimizer : torch.optim.Optimizer Instance of an optimizer. max_iter : int The number of iterations we run the training for (= number of graident descent steps). dataloader_val : None or DataLoader Dataloader yielding validation samples. If provided it will also single to us that we want to track metrics. val_freq : int How often evaluation run. 
""" iterable = loop_dataloader(dataloader_train) iterable = tqdm.tqdm(iterable, total=max_iter) it = 0 for X_batch, y_batch in iterable: if it == max_iter: break logit_batch = model(X_batch) loss = loss_inst(logit_batch, y_batch) if dataloader_val is not None: wandb.log({"loss": loss}, step=it) optimizer.zero_grad() loss.backward() optimizer.step() if it % val_freq == 0 and dataloader_val is not None: is_equal = [] for X_batch_val, y_batch_val in dataloader_val: is_equal.append( model(X_batch_val).argmax(dim=-1) == y_batch_val ) is_equal_t = torch.cat(is_equal) acc = is_equal_t.sum() / len(is_equal_t) wandb.log({"accuracy_val": acc}, step=it) it += 1 def main(argv=None): """Create CLI and run experiments.""" parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "-i", "--max-iter", help="Number of iterations", type=int, default=50000, ) parser.add_argument( "-b", "--batch-size", help="Batch size", type=int, default=60, ) parser.add_argument( "--prune-iter", help="Number of prune iterations", type=int, default=1, ) parser.add_argument( "-m", "--prune-method", help="Pruning method to employ", type=str, choices=("l1", "random"), default="l1", ) parser.add_argument( "-p", "--prune-ratio", help="Percentage of weights to remove", type=float, default=0.2, ) parser.add_argument( "--val-freq", help="How often to compute the validation accuracy", type=int, default=250, ) parser.add_argument( "-r", "--reinitialize", help="If true, reinitializes randomly all weights after pruning", type=str, choices=("true", "false"), # easy for hyperparameter search default="false", ) parser.add_argument( "-s", "--random-state", help="Random state", type=int, ) parser.add_argument( "--wandb-entity", help="W&B entity", type=str, default="mildlyoverfitted", ) parser.add_argument( "--wandb-project", help="W&B project", type=str, ) args = parser.parse_args(argv) wandb.init( project=args.wandb_project, entity=args.wandb_entity, 
def main(argv=None):
    """Create CLI and run experiments."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-i",
        "--max-iter",
        help="Number of iterations",
        type=int,
        default=50000,
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        help="Batch size",
        type=int,
        default=60,
    )
    parser.add_argument(
        "--prune-iter",
        help="Number of prune iterations",
        type=int,
        default=1,
    )
    parser.add_argument(
        "-m",
        "--prune-method",
        help="Pruning method to employ",
        type=str,
        choices=("l1", "random"),
        default="l1",
    )
    parser.add_argument(
        "-p",
        "--prune-ratio",
        help="Percentage of weights to remove",
        type=float,
        default=0.2,
    )
    parser.add_argument(
        "--val-freq",
        help="How often to compute the validation accuracy",
        type=int,
        default=250,
    )
    parser.add_argument(
        "-r",
        "--reinitialize",
        help="If true, reinitializes randomly all weights after pruning",
        type=str,
        choices=("true", "false"),  # easy for hyperparameter search
        default="false",
    )
    parser.add_argument(
        "-s",
        "--random-state",
        help="Random state",
        type=int,
    )
    parser.add_argument(
        "--wandb-entity",
        help="W&B entity",
        type=str,
        default="mildlyoverfitted",
    )
    parser.add_argument(
        "--wandb-project",
        help="W&B project",
        type=str,
    )
    args = parser.parse_args(argv)

    wandb.init(
        project=args.wandb_project,
        entity=args.wandb_entity,
        config=vars(args),
    )
    wandb.define_metric("accuracy_val", summary="max")

    dataset_train = MNISTDataset(
        "data",
        train=True,
        download=True,
    )
    dataset_val = MNISTDataset(
        "data",
        train=False,
        download=True,
    )

    if args.random_state is not None:
        torch.manual_seed(args.random_state)

    dataloader_train = DataLoader(
        dataset_train, batch_size=args.batch_size, shuffle=True
    )
    dataloader_val = DataLoader(
        dataset_val, batch_size=args.batch_size, shuffle=True
    )

    mlp_kwargs = dict(
        n_features=28 * 28,
        hidden_layer_sizes=(300, 100),
        n_targets=10,
    )
    mlp = MLP(**mlp_kwargs)
    # Keep an exact copy of the initial weights — the "lottery ticket"
    # rewind target after each pruning round.
    mlp_copy = MLP(**mlp_kwargs)
    mlp_copy.load_state_dict(mlp.state_dict())

    loss_inst = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=1.2 * 1e-3)

    # Train and prune loop
    if args.prune_ratio > 0:
        # Per-round ratio such that `prune_iter` rounds compound to the
        # overall requested `prune_ratio`.
        per_round_prune_ratio = 1 - (1 - args.prune_ratio) ** (
            1 / args.prune_iter
        )
        per_round_prune_ratios = [per_round_prune_ratio] * len(mlp.module_list)
        # Prune the output layer at half the rate (as in the paper)
        per_round_prune_ratios[-1] /= 2
        per_round_max_iter = int(args.max_iter / args.prune_iter)

        for prune_it in range(args.prune_iter):
            train(
                mlp,
                dataloader_train,
                loss_inst,
                optimizer,
                max_iter=per_round_max_iter,
            )
            prune_mlp(mlp, per_round_prune_ratios, method=args.prune_method)
            # Rewind surviving weights to their original initialization
            copy_weights_mlp(mlp_copy, mlp)

            stats = compute_stats(mlp)
            for name, stat in stats.items():
                wandb.run.summary[f"{name}_pruneiter={prune_it}"] = stat

            if args.reinitialize == "true":
                reinit_mlp(mlp)

    # Run actual training with a final pruned network
    train(
        mlp,
        dataloader_train,
        loss_inst,
        optimizer,
        max_iter=args.max_iter,
        dataloader_val=dataloader_val,
        val_freq=args.val_freq,
    )


if __name__ == "__main__":
    main()
class MLP(nn.Module):
    """Multilayer perceptron.

    The bias is included in all linear layers.

    Parameters
    ----------
    n_features : int
        Number of input features (pixels inside of MNIST images).

    hidden_layer_sizes : sequence of int
        Sizes of the hidden layers. Any iterable of ints is accepted
        (previously this had to be a tuple — a list raised a TypeError).

    n_targets : int
        Number of target classes (10 for MNIST).

    Attributes
    ----------
    module_list : nn.ModuleList
        List holding all the linear layers in the right order.
    """

    def __init__(self, n_features, hidden_layer_sizes, n_targets):
        super().__init__()
        # Star-unpacking generalizes the original tuple concatenation to
        # accept any sequence of hidden sizes.
        layer_sizes = (n_features, *hidden_layer_sizes, n_targets)

        self.module_list = nn.ModuleList(
            nn.Linear(n_in, n_out)
            for n_in, n_out in zip(layer_sizes[:-1], layer_sizes[1:])
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Batch of features of shape `(batch_size, n_features)`.

        Returns
        -------
        torch.Tensor
            Batch of predictions (logits) of shape `(batch_size, n_targets)`.
        """
        n_layers = len(self.module_list)

        for i, layer in enumerate(self.module_list):
            x = layer(x)
            # ReLU between layers, but no activation after the output layer
            if i < n_layers - 1:
                x = nn.functional.relu(x)

        return x
Parameters ---------- x : torch.Tensor Batch of features of shape `(batch_size, n_features)`. Returns ------- torch.Tensor Batch of predictions (logits) of shape `(batch_size, n_targets)`. """ n_layers = len(self.module_list) for i, layer in enumerate(self.module_list): x = layer(x) if i < n_layers - 1: x = nn.functional.relu(x) return x def prune_linear(linear, prune_ratio=0.3, method="l1"): """Prune a linear layer. Modifies the module in-place. We make an assumption that the bias is included. Parameters ---------- linear : nn.Linear Linear module containing a bias. prune_ratio : float Number between 0 and 1 representing the percentage of weights to prune. method : str, {"l1", "random"} Pruning method to use. """ if method == "l1": prune_func = l1_unstructured elif method == "random": prune_func = random_unstructured else: raise ValueError prune_func(linear, "weight", prune_ratio) prune_func(linear, "bias", prune_ratio) def prune_mlp(mlp, prune_ratio=0.3, method="l1"): """Prune each layer of the multilayer perceptron. Modifies the module in-place. We make an assumption that each linear layer has the bias included. Parameters ---------- mlp : MLP Multilayer perceptron instance. prune_ratio : float or list Number between 0 and 1 representing the percentage of weights to prune. If `list` then different ratio for each layer. method : str, {"l1", "random"} Pruning method to use. """ if isinstance(prune_ratio, float): prune_ratios = [prune_ratio] * len(mlp.module_list) elif isinstance(prune_ratio, list): if len(prune_ratio) != len(mlp.module_list): raise ValueError("Incompatible number of prune ratios provided") prune_ratios = prune_ratio else: raise TypeError for prune_ratio, linear in zip(prune_ratios, mlp.module_list): prune_linear(linear, prune_ratio=prune_ratio, method=method) def check_pruned_linear(linear): """Check if a Linear module was pruned. We require both the bias and the weight to be pruned. 
Parameters ---------- linear : nn.Linear Linear module containing a bias. Returns ------- bool True if the model has been pruned. """ params = {param_name for param_name, _ in linear.named_parameters()} expected_params = {"weight_orig", "bias_orig"} return params == expected_params def reinit_linear(linear): """Reinitialize a linear layer. This is an in-place operation. If the module has some pruning logic we are not going to remove it and we only initialize the underlying tensors - `weight_orig` and `bias_orig`. Parameters ---------- linear : nn.Linear Linear model containing a bias. """ is_pruned = check_pruned_linear(linear) # Get parameters of interest if is_pruned: weight = linear.weight_orig bias = linear.bias_orig else: weight = linear.weight bias = linear.bias # Initialize weight nn.init.kaiming_uniform_(weight, a=math.sqrt(5)) # Initialize bias fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 nn.init.uniform_(bias, -bound, bound) def reinit_mlp(mlp): """Reinitialize all layers of the MLP. Parameters ---------- mlp : MLP Multi-layer perceptron. """ for linear in mlp.module_list: reinit_linear(linear) def copy_weights_linear(linear_unpruned, linear_pruned): """Copy weights from an unpruned model to a pruned model. Modifies `linear_pruned` in place. Parameters ---------- linear_unpruned : nn.Linear Linear model with a bias that was not pruned. linear_pruned : nn.Linear Linear model with a bias that was pruned. """ assert check_pruned_linear(linear_pruned) assert not check_pruned_linear(linear_unpruned) with torch.no_grad(): linear_pruned.weight_orig.copy_(linear_unpruned.weight) linear_pruned.bias_orig.copy_(linear_unpruned.bias) def copy_weights_mlp(mlp_unpruned, mlp_pruned): """Copy weights of an unpruned network to a pruned network. Modifies `mlp_pruned` in place. Parameters ---------- mlp_unpruned : MLP MLP model that was not pruned. mlp_pruned : MLP MLP model that was pruned. 
""" zipped = zip(mlp_unpruned.module_list, mlp_pruned.module_list) for linear_unpruned, linear_pruned in zipped: copy_weights_linear(linear_unpruned, linear_pruned) def compute_stats(mlp): """Compute important statistics related to pruning. Parameters ---------- mlp : MLP Multilayer perceptron. Returns ------- dict Statistics. """ stats = {} total_params = 0 total_pruned_params = 0 for layer_ix, linear in enumerate(mlp.module_list): assert check_pruned_linear(linear) weight_mask = linear.weight_mask bias_mask = linear.bias_mask params = weight_mask.numel() + bias_mask.numel() pruned_params = (weight_mask == 0).sum() + (bias_mask == 0).sum() total_params += params total_pruned_params += pruned_params stats[f"layer{layer_ix}_total_params"] = params stats[f"layer{layer_ix}_pruned_params"] = pruned_params stats[f"layer{layer_ix}_actual_prune_ratio"] = pruned_params / params stats["total_params"] = total_params stats["total_pruned_params"] = total_pruned_params stats["actual_prune_ratio"] = total_pruned_params / total_params return stats ================================================ FILE: github_adventures/mixer/README.md ================================================ Note that the `official.py` is just a copy of the code provided in `https://arxiv.org/abs/2105.01601` and probably here `https://github.com/google-research/vision_transformer`. Please refer to those sources for licensing information. 
================================================ FILE: github_adventures/mixer/official.py ================================================ import einops import flax.linen as nn import jax.numpy as jnp class MlpBlock(nn.Module): mlp_dim: int @nn.compact def __call__(self, x): y = nn.Dense(self.mlp_dim)(x) y = nn.gelu(y) return nn.Dense(x.shape[-1])(y) class MixerBlock(nn.Module): tokens_mlp_dim: int channels_mlp_dim: int @nn.compact def __call__(self, x): y = nn.LayerNorm()(x) # (n_samples, n_patches, hidden_dim) y = jnp.swapaxes(y, 1, 2) y = MlpBlock(self.tokens_mlp_dim, name="token_mixing")(y) y = jnp.swapaxes(y, 1, 2) x = x + y y = nn.LayerNorm()(x) return x + MlpBlock(self.channels_mlp_dim, name="channel_mixing")(y) class MlpMixer(nn.Module): num_classes: int num_blocks: int patch_size: int hidden_dim: int tokens_mlp_dim: int channels_mlp_dim: int @nn.compact def __call__(self, x): s = self.patch_size x = nn.Conv(self.hidden_dim, (s, s), strides=(s, s), name="stem")(x) x = einops.rearrange(x, "n h w c -> n (h w) c") for _ in range(self.num_blocks): x = MixerBlock(self.tokens_mlp_dim, self.channels_mlp_dim)(x) x = nn.LayerNorm(name="pre_head_layer_norm")(x) x = jnp.mean(x, axis=1) return nn.Dense( self.num_classes, name="head", kernel_init=nn.initializers.zeros )(x) ================================================ FILE: github_adventures/mixer/ours.py ================================================ import einops import torch.nn as nn class MlpBlock(nn.Module): """Multilayer perceptron. Parameters ---------- dim : int Input and output dimension of the entire block. Inside of the mixer it will either be equal to `n_patches` or `hidden_dim`. mlp_dim : int Dimension of the hidden layer. Attributes ---------- linear_1, linear_2 : nn.Linear Linear layers. activation : nn.GELU Activation. 
""" def __init__(self, dim, mlp_dim=None): super().__init__() mlp_dim = dim if mlp_dim is None else mlp_dim self.linear_1 = nn.Linear(dim, mlp_dim) self.activation = nn.GELU() self.linear_2 = nn.Linear(mlp_dim, dim) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Input tensor of shape `(n_samples, n_channels, n_patches)` or `(n_samples, n_patches, n_channels)`. Returns ------- torch.Tensor Output tensor that has exactly the same shape as the input `x`. """ x = self.linear_1(x) # (n_samples, *, mlp_dim) x = self.activation(x) # (n_samples, *, mlp_dim) x = self.linear_2(x) # (n_samples, *, dim) return x class MixerBlock(nn.Module): """Mixer block that contains two `MlpBlock`s and two `LayerNorm`s. Parameters ---------- n_patches : int Number of patches the image is split up into. hidden_dim : int Dimensionality of patch embeddings. tokens_mlp_dim : int Hidden dimension for the `MlpBlock` when doing token mixing. channels_mlp_dim : int Hidden dimension for the `MlpBlock` when doing channel mixing. Attributes ---------- norm_1, norm_2 : nn.LayerNorm Layer normalization. token_mlp_block : MlpBlock Token mixing MLP. channel_mlp_block : MlpBlock Channel mixing MLP. """ def __init__( self, *, n_patches, hidden_dim, tokens_mlp_dim, channels_mlp_dim ): super().__init__() self.norm_1 = nn.LayerNorm(hidden_dim) self.norm_2 = nn.LayerNorm(hidden_dim) self.token_mlp_block = MlpBlock(n_patches, tokens_mlp_dim) self.channel_mlp_block = MlpBlock(hidden_dim, channels_mlp_dim) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Tensor of shape `(n_samples, n_patches, hidden_dim)`. Returns ------- torch.Tensor Tensor of the same shape as `x`, i.e. `(n_samples, n_patches, hidden_dim)`. 
""" y = self.norm_1(x) # (n_samples, n_patches, hidden_dim) y = y.permute(0, 2, 1) # (n_samples, hidden_dim, n_patches) y = self.token_mlp_block(y) # (n_samples, hidden_dim, n_patches) y = y.permute(0, 2, 1) # (n_samples, n_patches, hidden_dim) x = x + y # (n_samples, n_patches, hidden_dim) y = self.norm_2(x) # (n_samples, n_patches, hidden_dim) res = x + self.channel_mlp_block( y ) # (n_samples, n_patches, hidden_dim) return res class MlpMixer(nn.Module): """Entire network. Parameters ---------- image_size : int Height and width (assuming it is a square) of the input image. patch_size : int Height and width (assuming it is a square) of the patches. Note that we assume that `image_size % patch_size == 0`. tokens_mlp_dim : int Hidden dimension for the `MlpBlock` when doing the token mixing. channels_mlp_dim : int Hidden dimension for the `MlpBlock` when diong the channel mixing. n_classes : int Number of classes for classification. hidden_dim : int Dimensionality of patch embeddings. n_blocks : int The number of `MixerBlock`s in the architecture. Attributes ---------- patch_embedder : nn.Conv2D Splits the image up into multiple patches and then embeds each of them (using shared weights). blocks : nn.ModuleList List of `MixerBlock` instances. pre_head_norm : nn.LayerNorm Layer normalization applied just before the classification head. head_classifier : nn.Linear The classification head. 
""" def __init__( self, *, image_size, patch_size, tokens_mlp_dim, channels_mlp_dim, n_classes, hidden_dim, n_blocks, ): super().__init__() n_patches = (image_size // patch_size) ** 2 # assumes divisibility self.patch_embedder = nn.Conv2d( 3, hidden_dim, kernel_size=patch_size, stride=patch_size, ) self.blocks = nn.ModuleList( [ MixerBlock( n_patches=n_patches, hidden_dim=hidden_dim, tokens_mlp_dim=tokens_mlp_dim, channels_mlp_dim=channels_mlp_dim, ) for _ in range(n_blocks) ] ) self.pre_head_norm = nn.LayerNorm(hidden_dim) self.head_classifier = nn.Linear(hidden_dim, n_classes) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Input batch of square images of shape `(n_samples, n_channels, image_size, image_size)`. Returns ------- torch.Tensor Class logits of shape `(n_samples, n_classes)`. """ x = self.patch_embedder( x ) # (n_samples, hidden_dim, n_patches ** (1/2), n_patches ** (1/2)) x = einops.rearrange( x, "n c h w -> n (h w) c" ) # (n_samples, n_patches, hidden_dim) for mixer_block in self.blocks: x = mixer_block(x) # (n_samples, n_patches, hidden_dim) x = self.pre_head_norm(x) # (n_samples, n_patches, hidden_dim) x = x.mean(dim=1) # (n_samples, hidden_dim) y = self.head_classifier(x) # (n_samples, n_classes) return y ================================================ FILE: github_adventures/mixer/test_compare.py ================================================ import jax import numpy as np import pytest import torch from official import MlpMixer as OfficialMixer from ours import MlpMixer as OurMixer @pytest.mark.parametrize("image_size", [6, 12]) @pytest.mark.parametrize("patch_size", [2, 3]) @pytest.mark.parametrize("hidden_dim", [4, 5]) @pytest.mark.parametrize("n_blocks", [1, 2]) @pytest.mark.parametrize("n_classes", [4, 8]) @pytest.mark.parametrize("tokens_mlp_dim", [2, 4]) @pytest.mark.parametrize("channels_mlp_dim", [3, 6]) def test_compare( image_size, patch_size, hidden_dim, n_blocks, n_classes, tokens_mlp_dim, 
channels_mlp_dim, ): # Create Flax model model_flax = OfficialMixer( num_classes=n_classes, num_blocks=n_blocks, patch_size=patch_size, hidden_dim=hidden_dim, tokens_mlp_dim=tokens_mlp_dim, channels_mlp_dim=channels_mlp_dim, ) key1, key2 = jax.random.split(jax.random.PRNGKey(0)) x = jax.random.normal(key1, (11, image_size, image_size, 3)) # Dummy input params = model_flax.init(key2, x) # initialization call n_params_flax = sum( jax.tree_leaves(jax.tree_map(lambda x: np.prod(x.shape), params)) ) shape_flax = model_flax.apply(params, x).shape # Create Torch model model_torch = OurMixer( image_size=image_size, patch_size=patch_size, hidden_dim=hidden_dim, n_blocks=n_blocks, n_classes=n_classes, tokens_mlp_dim=tokens_mlp_dim, channels_mlp_dim=channels_mlp_dim, ) n_params_torch = sum( p.numel() for p in model_torch.parameters() if p.requires_grad ) shape_torch = model_torch(torch.rand(11, 3, image_size, image_size)).shape assert n_params_flax == n_params_torch assert shape_flax == shape_torch == (11, n_classes) ================================================ FILE: github_adventures/mixup/launch_experiments.sh ================================================ set -x N_EPOCHS=100000 N_SAMPLES=1000 SEED=123 TBOARD_DIR=tb_results/$SEED python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/no_regularization python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/weight_decay --weight-decay 0.6 python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/dropout -p 0.2 python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/mixup --mixup python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/input_mixup -k 0 1 --mixup python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/hidden_layers_mixup -k 1 4 --mixup ================================================ FILE: github_adventures/mixup/train.py ================================================ import argparse import json import numpy as np import torch from sklearn.model_selection 
import train_test_split from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter from utils import ( CustomDataset, MLPClassifierMixup, generate_prediction_img, generate_spirals, ) def main(argv=None): parser = argparse.ArgumentParser("Training") # Parameters parser.add_argument( "logpath", type=str, ) parser.add_argument( "-b", "--batch-size", type=int, default=32, help="Batch size", ) parser.add_argument( "--mixup", action="store_true", ) parser.add_argument( "-p", "--dropout-probability", type=float, default=0, help="The probability of dropout", ) parser.add_argument( "--hidden-dims", nargs="+", type=int, default=(32, 32, 32), help="Hidden dimensions of the MLP", ) parser.add_argument( "-c", "--n-cycles", type=float, default=2, help="Number of cycles when creating the spiral dataset", ) parser.add_argument( "-n", "--n-epochs", type=int, default=100, help="Number of epochs", ) parser.add_argument( "-k", "--mixing-layer", type=int, nargs=2, default=(None, None), help="The range of k to sample from", ) parser.add_argument( "-s", "--n-samples", type=int, default=1000, help="Number of samples", ) parser.add_argument( "-r", "--random-state", type=int, default=5, help="Random state", ) parser.add_argument( "--weight-decay", type=float, default=0.0, help="Weight decay", ) args = parser.parse_args(argv) device = torch.device("cpu") dtype = torch.float32 np.random.seed(args.random_state) torch.manual_seed(args.random_state) # Dataset preparation X, y = generate_spirals( args.n_samples, noise_std=0, n_cycles=args.n_cycles, ) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.9, shuffle=True, stratify=y, ) X_test_t = torch.from_numpy(X_test).to(device, dtype) dataset_train = CustomDataset(X_train, y_train) dataloader_train = DataLoader( dataset_train, batch_size=2 * args.batch_size, drop_last=True, shuffle=True, ) # Model and loss definition model = MLPClassifierMixup( n_features=2, hidden_dims=tuple(args.hidden_dims), 
p=args.dropout_probability, ) model.to(device, dtype) optimizer = torch.optim.AdamW( model.parameters(), weight_decay=args.weight_decay, ) loss_fn = torch.nn.BCEWithLogitsLoss() # Summary writer = SummaryWriter(args.logpath) writer.add_text("hparams", json.dumps(vars(args))) # Training + evaluation loop bs = args.batch_size n_steps = 0 for e in range(args.n_epochs): for X_batch, y_batch in dataloader_train: X_batch, y_batch = X_batch.to(device, dtype), y_batch.to( device, dtype ) if args.mixup: k_min, k_max = args.mixing_layer k_min = k_min or 0 k_max = k_max or model.n_hidden + 1 k = np.random.randint(k_min, k_max) lam = np.random.beta(2, 2) writer.add_scalar("k", k, n_steps) writer.add_scalar("lambda", lam, n_steps) h = model(X_batch, start=0, end=k) # (2 * batch_size, *) h_mixed = lam * h[:bs] + (1 - lam) * h[bs:] # (batch_size, *) y_mixed = lam * y_batch[:bs] + (1 - lam) * y_batch[bs:] # (batch_size,) logits = model(h_mixed, start=k, end=None) # (batch_size, 1) loss = loss_fn(logits.squeeze(), y_mixed) else: logits = model(X_batch[:bs]) # (batch_size, 1) loss = loss_fn(logits.squeeze(), y_batch[:bs]) optimizer.zero_grad() loss.backward() optimizer.step() # Logging writer.add_scalar("loss_train", loss, n_steps) if n_steps % 2500 == 0: model.eval() fig_gen = generate_prediction_img( model, X_train, X_test, y_train, y_test, ) writer.add_figure("test", next(fig_gen)) writer.add_figure("contour", next(fig_gen), n_steps) writer.add_figure("contour_train", next(fig_gen), n_steps) with torch.no_grad(): logits_test = model(X_test_t).squeeze().detach().cpu() acc_test = ( torch.sigmoid(logits_test).round().numpy() == y_test ).sum() / len(y_test) loss_test = loss_fn(logits_test, torch.from_numpy(y_test)) writer.add_scalar("loss_test", loss_test, n_steps) writer.add_scalar("accuracy_test", acc_test, n_steps) model.train() n_steps += 1 if __name__ == "__main__": main() ================================================ FILE: github_adventures/mixup/utils.py 
================================================ import matplotlib.pyplot as plt import numpy as np import torch import torch.nn as nn from matplotlib.colors import ListedColormap from torch.utils.data import Dataset class MLPClassifierMixup(nn.Module): """Multilayer perceptron with inbuilt mixup logic. Assuming binary classification. Parameters ---------- n_features : int Number of features. hidden_dims : tuple The sizes of the hidden layers. p : float Dropout probability. Attributes ---------- hidden_layers : nn.ModuleList List of hidden layers that are each composed of a `Linear`, `LeakyReLU` and `Dropout` modules. n_hidden : int Number of hidden layers. clf : nn.Linear The classifier at the end of the pipeline. """ def __init__(self, n_features, hidden_dims, p=0): super().__init__() dims = (n_features,) + hidden_dims self.n_hidden = len(hidden_dims) self.hidden_layers = nn.ModuleList( [ nn.Sequential( nn.Linear(dims[i], dims[i + 1]), nn.LeakyReLU(0.2), nn.Dropout(p), ) for i in range(self.n_hidden) ] ) self.clf = nn.Linear(dims[-1], 1) def forward(self, x, start=0, end=None): """Run forward pass. Parameters ---------- x : torch.Tensor Input of shape `(n_samples, dim)`. Note that the dim will depend on `start`. start : int The hidden layer where the forward pass starts (inclusive). We use a convention of `start=0` and `end=0` as a noop and the input tensor is returned. Useful for implementing input mixing. end : int or None The ending hidden layer (exclusive). If None, then always run until the last hidden layer and then we also apply the classifier. """ for module in self.hidden_layers[start:end]: x = module(x) if end is None: x = self.clf(x) return x class CustomDataset(Dataset): """Custom classification dataset assuming we have X and y loaded in memory. Parameters ---------- X : np.ndarray Features of shape `(n_samples, n_features)`. y : np.ndarray Targets of shape `(n_samples,)`. 
""" def __init__(self, X, y): if len(X) != len(y): raise ValueError("Inconsistent number of samples") classes = np.unique(y) if not np.array_equal(np.sort(classes), np.array([0, 1])): raise ValueError self.X = X self.y = y def __len__(self): """Compute the length of the dataset.""" return len(self.X) def __getitem__(self, ix): """Return a single sample.""" return self.X[ix], self.y[ix] def generate_spirals( n_samples, noise_std=0.05, n_cycles=2, random_state=None, ): """Generate two spirals dataset. Parameters ---------- n_samples : int Number of samples to generate. For simplicity, an even number is required. The targets (2 spirals) are perfectly balanced. noise_std : float Standard deviation of the noise added to the spirals. n_cycles : int Number of revolutions the spirals make. random_state : int or None Controls randomness. Returns ------- X : np.ndarray Features of shape `(n_samples, n_features)`. y : np.ndarray Targets of shape `(n_samples,)`. There are two classes 0 and 1 representing the two spirals. """ if n_samples % 2 != 0: raise ValueError("The number of samples needs to be even") n_samples_per_class = int(n_samples // 2) angle_1 = np.linspace(0, n_cycles * 2 * np.pi, n_samples_per_class) angle_2 = np.pi + angle_1 radius = np.linspace(0.2, 2, n_samples_per_class) x_1 = radius * np.cos(angle_1) y_1 = radius * np.sin(angle_1) x_2 = radius * np.cos(angle_2) y_2 = radius * np.sin(angle_2) X = np.concatenate( [ np.stack([x_1, y_1], axis=1), np.stack([x_2, y_2], axis=1), ], axis=0, ) y = np.zeros((n_samples,)) y[n_samples_per_class:] = 1.0 if random_state is not None: np.random.seed(random_state) new_ixs = np.random.permutation(n_samples) X = X[new_ixs] + np.random.normal( loc=0, scale=noise_std, size=(n_samples, 2) ) y = y[new_ixs] return X, y def generate_prediction_img( model, X_train, X_test, y_train, y_test, ): """Generate contour and scatter plots with predictions. Parameters ---------- model : MLPClassifierMixup Instance of a multilayer-perceptron. 
    X_train, X_test : np.ndarray
        Train and test features of shape `(n_samples, n_features)`.

    y_train, y_test : np.ndarray
        Train and test targets of shape `(n_samples,)`.

    Yields
    ------
    matplotlib.Figure
        Different figures.
    """
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype

    cm = plt.cm.RdBu
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])

    delta = 0.5
    xlim = (X_test[:, 0].min() - delta, X_test[:, 0].max() + delta)
    ylim = (X_test[:, 1].min() - delta, X_test[:, 1].max() + delta)

    n = 50
    xx, yy = np.meshgrid(
        np.linspace(xlim[0], xlim[1], n),
        np.linspace(ylim[0], ylim[1], n),
    )
    grid = np.stack([xx.ravel(), yy.ravel()], axis=1)

    with torch.no_grad():
        logits = model(torch.from_numpy(grid).to(device, dtype))
        probs = torch.sigmoid(logits)[:, 0].detach().cpu().numpy()

    probs = probs.reshape(xx.shape)

    fig, ax = plt.subplots(1, 1, dpi=170)

    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors="k"
    )
    ax.set_title("Test data")
    yield fig

    ax.cla()
    ax.contourf(xx, yy, probs, cmap=cm, alpha=0.8)
    ax.set_title("Prediction contours")
    yield fig

    ax.scatter(
        X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
    )
    ax.set_title("Train data + prediction contours")
    yield fig


================================================
FILE: github_adventures/ner_evaluation/README.md
================================================
* https://github.com/huggingface/evaluate/blob/af3c30561d840b83e54fc5f7150ea58046d6af69/metrics/seqeval/seqeval.py#L120
* https://github.com/chakki-works/seqeval/blob/cd01b5210eaa65e691c22320aba56f2be9e9fc43/seqeval/metrics/sequence_labeling.py#L1


================================================
FILE: github_adventures/ner_evaluation/ours.py
================================================
import re

import pandas as pd
from sklearn.metrics import classification_report


def check_valid(annots: list[str]) -> bool:
    allowed_pattern = re.compile(r"^(O$|B-.+$|I-.+$)")

    annots = ["O"] + annots
    n = len(annots)

    if
any(allowed_pattern.match(annot) is None for annot in annots): return False for i in range(1, n): annot = annots[i] if annot.startswith("I-"): if annots[i - 1] == "O" or annots[i - 1][2:] != annot[2:]: return False return True def get_etypes(annots: list[str]) -> list[None | str]: return [annot[2:] if annot != "O" else None for annot in annots] def get_entities(annots: list[str]) -> list[dict[str, int | str]]: if not check_valid(annots): raise ValueError("Invalid input.") annots = ["O"] + annots + ["O"] etypes = get_etypes(annots) n = len(annots) start_patterns = { ("O", "B-"), # ["O", "B-LOC"] ("B-", "B-"), # ["B-PERSON", "B-LOC"] ("I-", "B-"), # ["B-LOC", "I-LOC", "B-PERSON"] } end_patterns = { ("I-", "O"), # ["B-LOC", "I-LOC", "O"] ("B-", "O"), # ["B-LOC", "O"] ("B-", "B-"), # ["B-PERSON", "B-LOC"] ("I-", "B-"), # ["B-LOC", "I-LOC", "B-PERSON"] } entities: list[dict[str, int | str]] = [] i = 1 start = None while i < n: prev, curr = annots[i - 1], annots[i] pattern = (prev[:2], curr[:2]) if pattern in end_patterns and start is not None: entities.append( { "start": start - 1, "end": i - 2, "etype": etypes[i - 1], } ) start = None if pattern in start_patterns: start = i i += 1 return entities def get_report(annots_true: list[str], annots_pred: list[str]) -> dict: if len(annots_true) != len(annots_pred): raise ValueError("Unequal lengths") entities_true = pd.DataFrame(get_entities(annots_true)) entities_pred = pd.DataFrame(get_entities(annots_pred)) entities_true = entities_true.rename(columns={"etype": "etype_true"}) entities_pred = entities_pred.rename(columns={"etype": "etype_pred"}) df_merge = entities_true.merge(entities_pred, on=["start", "end"], how="outer") df = df_merge.fillna("") labels = (set(df["etype_true"].tolist()) | set(df["etype_pred"].tolist())) - {""} report = classification_report( df["etype_true"], df["etype_pred"], output_dict=True, labels=list(labels), ) return report ================================================ FILE: 
github_adventures/ner_evaluation/test_ours.py ================================================ import pytest from seqeval.metrics import classification_report as cr from seqeval.scheme import IOB2 from ours import check_valid, get_entities, get_etypes, get_report @pytest.mark.parametrize( "inp,out", [ ([], True), (["NONSENSE", "O"], False), (["O", "O", "O"], True), (["B-"], False), (["O", "I-ORG", "O"], False), (["O", "B-ORG", "I-PERSON"], False), (["O", "B-ORG", "B-PERSON"], True), (["O", "SOMETHING", "B-PERSON"], False), (["O-", "O", "O"], False), (["B-A", "O", "B-T"], True), (["I-a", "B-a", "B-a", "I-a", "I-a", "O"], False), ], ) def test_check_valid(inp, out): assert check_valid(inp) == out @pytest.mark.parametrize( "inp,out", [ ([], []), (["O", "O", "O"], [None, None, None]), (["O", "B-ORG", "O"], [None, "ORG", None]), (["O", "B-ORG", "B-ORG"], [None, "ORG", "ORG"]), (["O", "B-PERSON", "I-PERSON"], [None, "PERSON", "PERSON"]), (["B-A", "O", "B-T"], ["A", None, "T"]), ], ) def test_get_etypes(inp, out): assert get_etypes(inp) == out @pytest.mark.parametrize( "inp,out", [ (["O", "O", "O"], []), (["O", "B-ORG", "O"], [{"start": 1, "end": 1, "etype": "ORG"}]), ( ["O", "B-ORG", "B-ORG"], [ {"start": 1, "end": 1, "etype": "ORG"}, {"start": 2, "end": 2, "etype": "ORG"}, ], ), (["O", "B-PERSON", "I-PERSON"], [{"start": 1, "end": 2, "etype": "PERSON"}]), ( ["B-A", "O", "B-T"], [ {"start": 0, "end": 0, "etype": "A"}, {"start": 2, "end": 2, "etype": "T"}, ], ), (["B-LOC", "I-LOC", "I-LOC"], [{"start": 0, "end": 2, "etype": "LOC"}]), ( ["B-A", "I-A", "B-T"], [ {"start": 0, "end": 1, "etype": "A"}, {"start": 2, "end": 2, "etype": "T"}, ], ), ], ) def test_get_entities(inp, out): assert get_entities(inp) == out @pytest.mark.parametrize( "annots_true,annots_pred", [ ( ["O", "B-PERSON", "I-PERSON", "O"], ["O", "B-PERSON", "I-PERSON", "O"], ), ( ["O", "B-PERSON", "I-PERSON", "B-LOC"], ["O", "B-PERSON", "I-PERSON", "O"], ), ( ["O", "B-PERSON", "I-PERSON", "O"], ["O", "O", 
"B-PERSON", "O"], ), ( ["O", "B-PERSON", "I-PERSON", "O"], ["O", "O", "B-PERSON", "O"], ), ( ["B-PERSON", "B-LOC", "I-LOC", "B-DATE"], ["B-PERSON", "B-DATE", "B-PERSON", "B-DATE"], ), ( ["B-PERSON", "I-PERSON", "I-PERSON", "O", "O", "B-LOC", "B-DATE"], ["B-PERSON", "I-PERSON", "I-PERSON", "O", "O", "B-LOC", "B-DATE"], ), ( ["B-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC"], ["B-PERSON", "O", "B-DATE", "O", "B-LOC", "I-LOC", "I-LOC", "I-LOC"], ), ( ["B-PERSON", "I-PERSON", "O", "B-LOC", "I-LOC", "O", "B-PERSON", "B-PERSON", "B-LOC"], ["B-PERSON", "I-PERSON", "O", "B-LOC", "B-LOC", "O", "B-PERSON", "B-PERSON", "B-LOC"], ), ] ) def test_get_report(annots_true, annots_pred): report = get_report(annots_true, annots_pred) seqeval_report = cr([annots_true], [annots_pred], scheme=IOB2, mode="strict", output_dict=True) keys_to_delete = {"accuracy", "micro avg"} for rep in (report, seqeval_report): for key in keys_to_delete: try: rep.pop(key) except KeyError: pass assert report == seqeval_report ================================================ FILE: github_adventures/ner_evaluation/try.py ================================================ import pprint import evaluate metric = evaluate.load("seqeval") # Tom Cruise is great annots_true = ["B-PERSON", "I-PERSON", "O", "O"] # annots_pred = ["B-PERSON", "I-PERSON", "O", "O"] # annots_pred = ["O", "O", "O", "O"] # annots_pred = ["B-PERSON", "O", "O", "O"] annots_pred = ["B-LOCATION", "I-LOCATION", "O", "O"] result = metric.compute(references=[annots_true], predictions=[annots_pred]) pprint.pprint(result) ================================================ FILE: github_adventures/neuron/README.md ================================================ # Installation ```bash pip install -r requirements.txt ``` # Running training To run the same experiments as in the video run ```bash ./launch.sh ``` However, feel free to check the contents of the `launch.sh` for single experiments. 
# Evaluation and pretrained models This repo contains multiple pretrained models inside of `pretrained/`. They are all `.pkl` files and they were created by pickling `solutions.Solution` subclasses. To load them inside of Python run something along these lines ```python import pickle solution_path = "pretrained/invariant_ours.pkl" # you can change this with open(solution_path, "rb") as f: solution = pickle.load(f)[0] ``` You can also run any of the below scripts to reproduce the results from the end of the video. ```bash EPISODES=30 python evaluate_shuffling.py -e $EPISODES python evaluate_noise.py -e $EPISODES python evaluate_video.py -e $EPISODES ``` ================================================ FILE: github_adventures/neuron/evaluate_noise.py ================================================ """Assumes you have already trained your model and you have a checkpoint.""" import argparse import pathlib import pickle import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from tasks import IncompatibleNFeatures, Task def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( "-e", "--n-episodes", type=int, default=200, ) args = parser.parse_args(argv) # Prepare solutions and tasks checkpoint_path = pathlib.Path("pretrained") / "invariant_official.pkl" assert checkpoint_path.exists() with checkpoint_path.open("rb") as f: obj = pickle.load(f) if len(obj) == 1: solution_inst = obj[0] elif len(obj) == 2: solver, solution_inst = obj solution_inst.set_params(solver.result.xfavorite) else: raise ValueError results = [] for n_noise_features in range(0, 30, 5): for shuffle in [True, False]: print(f"{n_noise_features=}, {shuffle=}") task = Task( render=False, n_noise_features=n_noise_features, shuffle_on_reset=shuffle, env_seed=None, feature_seed=None, ) for episode_ix in range(args.n_episodes): reward = task.rollout(solution_inst) results.append( { "n_noise_features": n_noise_features, "shuffle": shuffle, "episode_ix": episode_ix, 
"reward": reward, } ) results_df = pd.DataFrame(results) fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300) sns.violinplot( data=results_df, x="n_noise_features", y="reward", hue="shuffle", split=True, inner="quart", linewidth=1, palette="muted", ax=ax, scale="count", ) sns.despine(left=True) ax.set_ylim(0, 1000) ax.grid(True) fig.tight_layout() fig.savefig("invariant_model_noise.png") if __name__ == "__main__": main() ================================================ FILE: github_adventures/neuron/evaluate_shuffling.py ================================================ """Assumes you have already trained your model and you have a checkpoint.""" import argparse import pathlib import pickle import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from tasks import IncompatibleNFeatures, Task def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( "-e", "--n-episodes", type=int, default=200, ) args = parser.parse_args(argv) # Prepare solutions and tasks checkpoints = {} checkpoint_folder = pathlib.Path("pretrained") assert checkpoint_folder.exists() checkpoint_paths = [ checkpoint_folder / "linear.pkl", checkpoint_folder / "linear_augment.pkl", checkpoint_folder / "MLP.pkl", checkpoint_folder / "MLP_augment.pkl", checkpoint_folder / "invariant_ours.pkl", checkpoint_folder / "invariant_official.pkl", ] for path in checkpoint_paths: with path.open("rb") as f: obj = pickle.load(f) if len(obj) == 1: solution_inst = obj[0] elif len(obj) == 2: solver, solution_inst = obj solution_inst.set_params(solver.result.xfavorite) else: raise ValueError checkpoints[path.stem] = solution_inst results = [] for model_name, solution_inst in checkpoints.items(): for shuffle in [True, False]: print(f"{model_name=}, {shuffle=}") task = Task( render=False, n_noise_features=0, shuffle_on_reset=shuffle, env_seed=None, feature_seed=None, ) for episode_ix in range(args.n_episodes): reward = task.rollout(solution_inst) results.append( { "model": model_name, 
"shuffle": shuffle, "episode_ix": episode_ix, "reward": reward, } ) results_df = pd.DataFrame(results) fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300) sns.violinplot( data=results_df, x="model", y="reward", hue="shuffle", split=True, inner="quart", linewidth=1, palette="muted", ax=ax, scale="count", order=sorted(checkpoints.keys()), ) sns.despine(left=True) ax.set_ylim(0, 1000) ax.grid(True) fig.tight_layout() fig.savefig("all_models_shuffling.png") if __name__ == "__main__": main() ================================================ FILE: github_adventures/neuron/evaluate_video.py ================================================ """Assumes you have already trained your model and you have a checkpoint.""" import argparse import pathlib import pickle from gym.wrappers import Monitor import matplotlib.pyplot as plt from tasks import IncompatibleNFeatures, Task def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( "-e", "--n-episodes", type=int, default=2, ) args = parser.parse_args(argv) # Prepare solutions and tasks checkpoints = {} checkpoint_folder = pathlib.Path("pretrained") assert checkpoint_folder.exists() checkpoint_paths = [ checkpoint_folder / "linear.pkl", checkpoint_folder / "linear_augment.pkl", checkpoint_folder / "MLP.pkl", checkpoint_folder / "MLP_augment.pkl", checkpoint_folder / "invariant_ours.pkl", checkpoint_folder / "invariant_official.pkl", ] checkpoint_paths = checkpoint_paths for path in checkpoint_paths: with path.open("rb") as f: obj = pickle.load(f) if len(obj) == 1: solution_inst = obj[0] elif len(obj) == 2: solver, solution_inst = obj solution_inst.set_params(solver.result.xfavorite) else: raise ValueError checkpoints[path.stem] = solution_inst for model_name, solution_inst in checkpoints.items(): for shuffle in [True, False]: for episode_ix in range(args.n_episodes): print(f"{model_name=}, {shuffle=}") task = Task( render=False, n_noise_features=0, shuffle_on_reset=shuffle, env_seed=None, feature_seed=None, ) 
task.env = Monitor( task.env, f"videos/{model_name}/{shuffle}/{episode_ix}/", ) reward = task.rollout(solution_inst) if __name__ == "__main__": main() ================================================ FILE: github_adventures/neuron/launch.sh ================================================ OUTPUT_FOLDER=log_dir python trainer.py --max-iter 1000 linear $OUTPUT_FOLDER/linear python trainer.py --max-iter 1000 --shuffle-on-reset linear $OUTPUT_FOLDER/linear_augment python trainer.py --max-iter 1000 MLP $OUTPUT_FOLDER/MLP python trainer.py --max-iter 2000 --shuffle-on-reset MLP $OUTPUT_FOLDER/MLP_augment python trainer.py --max-iter 14000 invariant $OUTPUT_FOLDER/invariant ================================================ FILE: github_adventures/neuron/requirements.txt ================================================ cma gym gym-cartpole-swingup matplotlib numpy pandas seaborn tensorboard torch tqdm ================================================ FILE: github_adventures/neuron/solutions.py ================================================ import abc import numpy as np import torch from torch_utils import PermutationInvariantNetwork, MLP class Solution(abc.ABC): """Solution abstract class. Attributes ---------- policy : torch.nn.Module Network that holds all the learnable parameters. """ @abc.abstractmethod def clone(self, obs): """Create a copy of the current solution without any links to self.""" @abc.abstractmethod def get_action(self, obs): """Determine the next action given the observation array.""" @abc.abstractmethod def get_n_features(self): """Get the number of features expected by the model. If None then the model can process variable-sized feature vectors. """ @abc.abstractmethod def reset(self): """Reset solution. Will be called at the beginning of each rollout. Does not mean we will "reinitialize" the weights of `policy`. """ def get_params(self): """Get learnable parameters of the solution. 
Returns ------- params : np.ndarray 1D array containing all parameters. """ params_l = [] for p in self.policy.parameters(): params_l.append(p.numpy().ravel()) params = np.concatenate(params_l) return params def set_params(self, params): """Set the learnable parameters. Parameters ---------- params : np.ndarray 1D array containing all parameters. Returns ------- self : Solution """ start_ix, end_ix = 0, 0 for p in self.policy.parameters(): end_ix = start_ix + np.prod(p.shape) p.data = torch.from_numpy( params[start_ix:end_ix].reshape(p.shape) ).float() start_ix = end_ix return self def get_n_params(self): return len(self.get_params()) class MLPSolution(Solution): """Multilayer perceptron solution. Parameters ---------- n_features : int Number of input features. hidden_layer_sizes : tuple Tuple of int that defines the sizes of all hidden layers. Attributes ---------- kwargs : dict All parameters necessary to instantiate the class. policy : MLP Policy network - multilayer perceptron. """ def __init__(self, n_features=5, hidden_layer_sizes=(16,)): self.kwargs = { "n_features": n_features, "hidden_layer_sizes": hidden_layer_sizes, } self.dtype = torch.float32 self.policy = MLP(n_features, hidden_layer_sizes) self.policy.to(self.dtype) self.policy.eval() def clone(self): old_policy = self.policy new_solution = self.__class__(**self.kwargs) new_solution.policy.load_state_dict( old_policy.state_dict(), ) return new_solution def get_action(self, obs): y = self.policy(torch.from_numpy(obs).to(self.dtype)) action = y.item() return action def get_n_features(self): return self.kwargs["n_features"] def reset(self): pass class PermutationInvariantSolution(Solution): """Permutation invariant solution. Parameters ---------- n_embeddings : int Number of rows in the Q tensor. proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int Dimensionality of the Q and K tensors before linear projections. 
Attributes ---------- kwargs : dict All parameters necessary to instantiate the class dtype : torch.dtype Dtype of both the network weights and input features. policy : PermutationInvariantNetwork Policy network. prev_action : float Stores the previous action. Automatically updated each time we call `get_action`. """ def __init__( self, n_embeddings=16, proj_dim=32, hidden_size=8, ): self.kwargs = { "n_embeddings": n_embeddings, "proj_dim": proj_dim, "hidden_size": hidden_size, } self.policy = PermutationInvariantNetwork( n_embeddings=n_embeddings, proj_dim=proj_dim, hidden_size=hidden_size, ) self.dtype = torch.float32 self.policy.to(self.dtype) self.policy.eval() self.prev_action = 0 # will be continuously updated def clone(self): old_policy = self.policy new_solution = self.__class__(**self.kwargs) new_solution.policy.load_state_dict( old_policy.state_dict(), ) return new_solution def get_action(self, obs): y = self.policy(torch.from_numpy(obs).to(self.dtype), self.prev_action) action = y.item() self.prev_action = action return action def reset(self): self.policy.attention_neuron.hx = None self.previous_action = 0 def get_n_features(self): return None ================================================ FILE: github_adventures/neuron/tasks.py ================================================ import gym import gym_cartpole_swingup # noqa has a sideffect import numpy as np N_ORIGINAL_FEATURES = 5 class IncompatibleNFeatures(Exception): """Raised when observation and model number of features does not match.""" class Task: """Cartpoleswingup task. Parameters ---------- render : bool If True, we render each step into a video frame. shuffle_on_reset : bool If True, the features are randomly shuffled before each rollout. n_noise_features : int Number of noise features added to the observation vector. env_seed : None or int Random state controling the underlying `gym.Env`. feature_seed : None or int Random state controling the shuffling and noise features. 
    max_episode_steps : int
        Maximum number of steps per episode (=rollout). After this number
        `done=True` automatically.

    Attributes
    ----------
    n_features : int
        Overall number of features (original + noise).
    perm_ix : np.ndarray
        1D array storing the permutation indices of the features.
    env : gym.Env
        Environment.
    rnd : RandomState
        Random state.
    """

    def __init__(
        self,
        render=False,
        shuffle_on_reset=False,
        n_noise_features=0,
        env_seed=None,
        feature_seed=None,
        max_episode_steps=1000,
    ):
        self.env = gym.make("CartPoleSwingUp-v1")
        # Override gym's default episode-length cap for this env instance.
        self.env._max_episode_steps = max_episode_steps
        self.shuffle_on_reset = shuffle_on_reset
        self.render = render
        self.n_noise_features = n_noise_features
        self.n_features = N_ORIGINAL_FEATURES + n_noise_features
        # Identity permutation until `reset_for_rollout` (re)shuffles it.
        self.perm_ix = np.arange(self.n_features)
        self.noise_std = 0.1

        # Set seeds
        self.env.seed(env_seed)
        self.rnd = np.random.RandomState(seed=feature_seed)

    def reset_for_rollout(self):
        """Generate a new permutation of the features.

        It is going to be called at the beginning of each episode. Note
        that the permutation stays constant throughout the episode.
        """
        self.perm_ix = np.arange(self.n_features)
        if self.shuffle_on_reset:
            self.rnd.shuffle(self.perm_ix)

    def modify_obs(self, obs):
        """Modify raw observations.

        Appends Gaussian noise features to the raw observation and then
        applies the current episode's permutation.

        Parameters
        ----------
        obs : np.ndarray
            Raw observation/feature array of shape `(5,)`.

        Returns
        -------
        obs_modified : np.ndarray
            Modified observation array of shape `(5 + n_noise_features,)`.
            If `shuffle_on_reset` then the order of the features is going
            to change.
        """
        noise = self.rnd.randn(self.n_noise_features) * self.noise_std
        obs_and_noise = np.concatenate([obs, noise], axis=0)
        obs_modified = obs_and_noise[self.perm_ix]

        return obs_modified

    def rollout(self, solution):
        """Run a single episode/rollout.

        Parameters
        ----------
        solution : solutions.Solution
            Instance of a solution that yields an action given an
            observation.

        Returns
        -------
        ep_reward : int
            Overall episode reward computed as a sum of per step rewards.
""" # sanity check n_features_solution = solution.get_n_features() n_features_task = self.n_features if ( n_features_solution is not None and n_features_solution != n_features_task ): raise IncompatibleNFeatures self.reset_for_rollout() solution.reset() # important for PermutationInvariantSolution obs = self.env.reset() if self.render: self.env.render() ep_reward = 0 done = False while not done: obs_modified = self.modify_obs(obs) action = solution.get_action(obs_modified) obs, reward, done, _ = self.env.step(action) ep_reward += reward if self.render: self.env.render() return ep_reward ================================================ FILE: github_adventures/neuron/torch_utils.py ================================================ import numpy as np import torch import torch.nn as nn class MLP(nn.Module): """Multilayer perceptron policy network. Parameters ---------- n_features : int Number of input features. hidden_layer_sizes : tuple Tuple of int that defines the sizes of all hidden layers. Attributes ---------- net : nn.Sequential The actual network. """ def __init__(self, n_features, hidden_layer_sizes): super().__init__() layer_sizes = (n_features,) + hidden_layer_sizes + (1,) layers = [] for i in range(len(layer_sizes) - 1): in_features = layer_sizes[i] out_features = layer_sizes[i + 1] layers.extend( [ nn.Linear(in_features, out_features), nn.Tanh(), ] ) self.net = nn.Sequential(*layers) for p in self.parameters(): p.requires_grad = False def forward(self, obs): """Run forward pass. Parameters ---------- obs : torch.Tensor 1D tensor representing the input observation of shape `(n_features,)`. Returns ------- torch.Tensor Scalar between -1 and 1 representing the action. """ return self.net(obs[None, :])[0] def pos_table(n_embeddings, hidden_size): """Create a table of positional encodings. Parameters ---------- n_embeddings : int Number of rows of the table. hidden_size : int Number of columns of the table. 
    Returns
    -------
    tab : np.ndarray
        2D array holding the positional encodings.
    """

    # Standard sinusoidal positional-encoding construction: angles shrink
    # geometrically with the (paired) column index.
    def get_angle(x, h):
        return x / np.power(10000, 2 * (h // 2) / hidden_size)

    def get_angle_vec(x):
        return [get_angle(x, j) for j in range(hidden_size)]

    tab = np.array([get_angle_vec(i) for i in range(n_embeddings)]).astype(
        float
    )
    # Even columns get sine, odd columns get cosine.
    tab[:, 0::2] = np.sin(tab[:, 0::2])
    tab[:, 1::2] = np.cos(tab[:, 1::2])

    return tab


class AttentionMatrix(nn.Module):
    """Generates attention matrix using the key and query tensors.

    Parameters
    ----------
    proj_dim : int
        Size of the space to which we project the K and Q tensors.
    hidden_size : int
        Dimensionality of the Q and K tensors before linear projections.
    scale : bool
        If True, then the attention matrix will be divided by
        `proj_dim ** (1 / 2)` elementwise.

    Attributes
    ----------
    proj_q, proj_k : torch.nn.Linear
        Linear models projecting the Q and K tensors.
    scalar : float
        Number used for scaling the attention matrix elementwise.
    """

    def __init__(self, hidden_size, proj_dim, scale=True):
        super().__init__()
        # No bias: the projections are pure linear maps.
        self.proj_q = nn.Linear(
            in_features=hidden_size, out_features=proj_dim, bias=False
        )
        self.proj_k = nn.Linear(
            in_features=hidden_size, out_features=proj_dim, bias=False
        )

        if scale:
            self.scalar = np.sqrt(proj_dim)
        else:
            self.scalar = 1

    def forward(self, data_q, data_k):
        """Run the forward pass.

        Parameters
        ----------
        data_q : torch.Tensor
            Query tensor of shape `(n_embeddings, hidden_size)`.
        data_k : torch.Tensor
            Key tensor of shape `(n_features, hidden_size)`.

        Returns
        -------
        attention_weights : torch.Tensor
            Attention weights (don't sum up to 1 in general) of shape
            `(n_embeddings, n_features)`.
""" q = self.proj_q(data_q) # (n_embeddings, proj_dim) k = self.proj_k(data_k) # (n_features, proj_dim) dot = q @ k.T # (n_embeddings, n_features) dot_scaled = torch.div(dot, self.scalar) # (n_embeddings, n_features) attention_weights = torch.tanh( dot_scaled ) # (n_embeddings, n_features) return attention_weights class AttentionNeuron(nn.Module): """Permutation invariant layer. Parameters ---------- n_embeddings : int Number of rows in the Q tensor. In our case it is equal to the length of the latent code `m`. proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int The dimensionality of the Q and K tensors before linear projections. Attributes ---------- hx : tuple or None If not None then a tuple of 2 hidden state tensors (LSTM specific) lstm : nn.LSTMCell LSTM cell that inputs a hidden state and an observation and outputs a new hidden state. attention_matrix : AttentionMatrix Attention matrix (only needs Q and K tensors). Q : torch.Tensor Query tensor that is not learnable since it is populated with positional encodings. """ def __init__( self, n_embeddings=16, proj_dim=32, hidden_size=8, ): super().__init__() self.n_embeddings = n_embeddings self.proj_dim = proj_dim self.hidden_size = hidden_size # Modules self.hx = None self.lstm = nn.LSTMCell(input_size=2, hidden_size=hidden_size) self.attention_matrix = AttentionMatrix( hidden_size=hidden_size, proj_dim=proj_dim, scale=False, ) self.register_buffer( "Q", torch.from_numpy( pos_table( n_embeddings, hidden_size, ) ).float(), ) def forward(self, obs, prev_action): """Run forward pass. Parameters ---------- obs : torch.Tensor 1D tensor representing the input observations of shape `(n_features,)`. prev_action : float Number between -1 and 1 based on what the previous action was. Returns ------- latent_code : torch.Tensor 1D tensor representing the latent code of shape `(n_embeddings,)`. 
attn_weights : torch.Tensor 2D tensor of shape `(n_embeddings, n_features)` representing attention weights. """ n_features = len(obs) prev_action = float(prev_action) obs_and_act = torch.cat( [ obs[:, None], torch.ones(n_features, 1) * prev_action, ], dim=-1, ) # (n_features, 2) if self.hx is None: self.hx = ( torch.zeros(n_features, self.hidden_size), torch.zeros(n_features, self.hidden_size), ) self.hx = self.lstm( obs_and_act, self.hx ) # Tuple[(n_features, hidden_size)] data_q = self.Q # (n_embeddings, hidden_size) data_k = self.hx[0] # (n_features, hidden_size) data_v = obs[:, None] # (n_features, 1) attn_weights = self.attention_matrix( data_q=data_q, data_k=data_k ) # (n_embeddings, n_features) latent_code_ = torch.tanh(attn_weights @ data_v) # (n_embeddings, 1) latent_code = latent_code_.squeeze() # (n_embeddings,) return latent_code, attn_weights class PermutationInvariantNetwork(nn.Module): """Permutation invariant policy network. Parameters ---------- n_embeddings : int Number of rows in the Q tensor. proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int Dimensionality of the Q and K matrices before linear projections. Attributes ---------- attention_neuron : AttentionNeuron Permutation invariant layer that generates latent codes. linear : nn.Linear Maps the latent code into a single number. """ def __init__( self, n_embeddings=16, proj_dim=32, hidden_size=8, ): super().__init__() self.attention_neuron = AttentionNeuron( n_embeddings=n_embeddings, proj_dim=proj_dim, hidden_size=hidden_size, ) self.linear = nn.Linear(n_embeddings, 1) for p in self.parameters(): p.requires_grad = False def forward(self, obs, prev_action): """Run forward pass. Parameters ---------- obs : torch.Tensor 1D tensor representing the input observations of shape `(n_features,)`. prev_action : float Number between -1 and 1 based on what the previous action was. 
Returns ------- y : torch.Tensor Scalar tensor with a value in range (-1, 1) representing the next action. """ latent_code, _ = self.attention_neuron( obs, prev_action ) # (n_embeddings,) y_ = torch.tanh(self.linear(latent_code[None, :])) # (1, 1) y = y_[0] # (1,) return y ================================================ FILE: github_adventures/neuron/trainer.py ================================================ import argparse import json import multiprocessing as mp import pathlib import pickle from functools import partial import cma import numpy as np import tqdm from torch.utils.tensorboard import SummaryWriter from solutions import ( MLPSolution, PermutationInvariantSolution, ) from tasks import Task, N_ORIGINAL_FEATURES def save(folder, n_iter, solver, solution_inst): """Save checkpoint. Parameters ---------- folder : str Output folder. n_iter : int Iteration that corresponds to the checkpoint. solver : cma.CMAEvolutionStrategy Solver instance. solution_inst : Solution Solution instance. """ folder = pathlib.Path(folder) folder.mkdir(parents=True, exist_ok=True) path = folder / f"{n_iter}.pkl" with path.open("wb") as f: obj = (solver, solution_inst) pickle.dump(obj, f) def get_fitness( solution_inst, *, shuffle_on_reset, n_episodes, n_noise_features, env_seed, feature_seed, ): """Get fitness function used by the CMA optimizer/solver. Can be run independently on a single worker. Returns ------- fitness : list List of floats of length `n_episodes` holding the per episode reward. 
""" task = Task( render=False, shuffle_on_reset=shuffle_on_reset, n_noise_features=n_noise_features, env_seed=env_seed, feature_seed=feature_seed, ) fitness = [task.rollout(solution_inst) for _ in range(n_episodes)] return fitness def main(argv=None): parser = argparse.ArgumentParser( "Training", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "solution", type=str, choices=( "linear", "MLP", "invariant", ), ) parser.add_argument( "log_dir", type=str, help="Logging folder", ) parser.add_argument( "--checkpoint", type=str, help="Pickled solver and solution", ) parser.add_argument( "--env-seed", type=int, ) parser.add_argument( "--eval-frequency", type=int, default=25, ) parser.add_argument( "--feature-seed", type=int, ) parser.add_argument( "-m", "--max-iter", type=int, default=10000, help="Maximum number of iterations", ) parser.add_argument( "-e", "--n-episodes", type=int, default=16, help="Number of rollouts for fitness evaluation", ) parser.add_argument( "-j", "--n-jobs", type=int, default=-1, help="Number of processes", ) parser.add_argument( "-n", "--n-noise-features", type=int, default=0, help="Number of noise features", ) parser.add_argument( "-p", "--population-size", type=int, default=256, help="Number of solutions per generation", ) parser.add_argument( "-s", "--shuffle-on-reset", action="store_true", help="Shuffle features before each rollout", ) args = parser.parse_args(argv) writer = SummaryWriter(args.log_dir) writer.add_text("parameters", json.dumps(vars(args))) # Solution map if args.solution == "linear": solution_inst = MLPSolution( n_features=N_ORIGINAL_FEATURES + args.n_noise_features, hidden_layer_sizes=tuple(), ) elif args.solution == "MLP": solution_inst = MLPSolution( n_features=N_ORIGINAL_FEATURES + args.n_noise_features, hidden_layer_sizes=(16,), ) elif args.solution == "invariant": solution_inst = PermutationInvariantSolution( n_embeddings=16, proj_dim=32, hidden_size=8, ) else: raise ValueError # Prepare 
solver if args.checkpoint is None: x0 = np.zeros(solution_inst.get_n_params()) solver = cma.CMAEvolutionStrategy( x0=x0, sigma0=0.1, inopts={ "popsize": args.population_size, "seed": 42, "randn": np.random.randn, }, ) else: with open(args.checkpoint, "rb") as f: solver, solution_inst_ = pickle.load(f) assert isinstance(solution_inst, solution_inst_.__class__) solution_inst = solution_inst_ get_fitness_partial = partial( get_fitness, n_episodes=args.n_episodes, shuffle_on_reset=args.shuffle_on_reset, n_noise_features=args.n_noise_features, env_seed=args.env_seed, feature_seed=args.feature_seed, ) if args.n_jobs == -1: n_jobs = mp.cpu_count() else: n_jobs = args.n_jobs with mp.Pool(processes=n_jobs) as pool: for n_iter in tqdm.tqdm(range(args.max_iter)): try: params_set = solver.ask() iterable = [ solution_inst.clone().set_params(p) for p in params_set ] rewards = pool.map(get_fitness_partial, iterable) pos_fitnesses = [np.mean(r) for r in rewards] neg_fitnesses = [-x for x in pos_fitnesses] all_parameters = np.concatenate(params_set) metrics = { "parameter_mean": all_parameters.mean(), "parameter_std": all_parameters.std(), "mean": np.mean(pos_fitnesses), "max (generation)": np.max(pos_fitnesses), "max (overall)": -solver.result.fbest, } for metric_name, metric in metrics.items(): writer.add_scalar(metric_name, metric, global_step=n_iter) if (n_iter % args.eval_frequency == 0) or ( n_iter == (args.max_iter - 1) ): save(args.log_dir, n_iter, solver, solution_inst) solver.tell(params_set, neg_fitnesses) except KeyboardInterrupt: save( args.log_dir, n_iter, solver, solution_inst, ) break if __name__ == "__main__": main() ================================================ FILE: github_adventures/pondernet/experiment_1.sh ================================================ set -x SEED=$RANDOM LAMBDAS=(0.1 0.3 0.5 0.7 0.9) for lambda in ${LAMBDAS[@]} do python train.py \ --batch-size 128 \ --beta 0.01 \ --device cuda \ --eval-frequency 4000 \ --n-iter 100000 \ --n-hidden 128 \ 
--lambda-p $lambda \ --n-elems 15 \ results/experiment_a/$SEED/lambda_$lambda done ================================================ FILE: github_adventures/pondernet/experiment_2.sh ================================================ set -x SEED=$RANDOM python train.py \ --batch-size 128 \ --beta 0.01 \ --eval-frequency 4000 \ --device cuda \ --lambda-p 0.2 \ --n-elems 30 \ --n-iter 1500000 \ --n-hidden 128 \ --n-nonzero 1 25 \ results/experiment_b/$SEED ================================================ FILE: github_adventures/pondernet/requirements.txt ================================================ matplotlib numpy tensorboard torch tqdm ================================================ FILE: github_adventures/pondernet/train.py ================================================ from argparse import ArgumentParser import json import pathlib import matplotlib.pyplot as plt import torch import torch.nn as nn from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from utils import ( ParityDataset, PonderNet, ReconstructionLoss, RegularizationLoss, ) @torch.no_grad() def evaluate(dataloader, module): """Compute relevant metrics. Parameters ---------- dataloader : DataLoader Dataloader that yields batches of `x` and `y`. module : PonderNet Our pondering network. Returns ------- metrics_single : dict Scalar metrics. The keys are names and the values are `torch.Tensor`. These metrics are computed as mean values over the entire dataset. metrics_per_step : dict Per step metrics. The keys are names and the values are `torch.Tensor` of shape `(max_steps,)`. These metrics are computed as mean values over the entire dataset. 
""" # Imply device and dtype param = next(module.parameters()) device, dtype = param.device, param.dtype metrics_single_ = { "accuracy_halted": [], "halting_step": [], } metrics_per_step_ = { "accuracy": [], "p": [], } for x_batch, y_true_batch in dataloader: x_batch = x_batch.to(device, dtype) # (batch_size, n_elems) y_true_batch = y_true_batch.to(device, dtype) # (batch_size,) y_pred_batch, p, halting_step = module(x_batch) y_halted_batch = y_pred_batch.gather( dim=0, index=halting_step[None, :] - 1, )[ 0 ] # (batch_size,) # Computing single metrics (mean over samples in the batch) accuracy_halted = ( ((y_halted_batch > 0) == y_true_batch).to(torch.float32).mean() ) metrics_single_["accuracy_halted"].append(accuracy_halted) metrics_single_["halting_step"].append( halting_step.to(torch.float).mean() ) # Computing per step metrics (mean over samples in the batch) accuracy = ( ((y_pred_batch > 0) == y_true_batch[None, :]) .to(torch.float32) .mean(dim=1) ) metrics_per_step_["accuracy"].append(accuracy) metrics_per_step_["p"].append(p.mean(dim=1)) metrics_single = { name: torch.stack(values).mean(dim=0).cpu().numpy() for name, values in metrics_single_.items() } metrics_per_step = { name: torch.stack(values).mean(dim=0).cpu().numpy() for name, values in metrics_per_step_.items() } return metrics_single, metrics_per_step def plot_distributions(target, predicted): """Create a barplot. Parameters ---------- target, predicted : np.ndarray Arrays of shape `(max_steps,)` representing the target and predicted probability distributions. 
Returns ------- matplotlib.Figure """ support = list(range(1, len(target) + 1)) fig, ax = plt.subplots(dpi=140) ax.bar( support, target, color="red", label=f"Target - Geometric({target[0].item():.2f})", ) ax.bar( support, predicted, color="green", width=0.4, label="Predicted", ) ax.set_ylim(0, 0.6) ax.set_xticks(support) ax.legend() ax.grid() return fig def plot_accuracy(accuracy): """Create a barplot representing accuracy over different halting steps. Parameters ---------- accuracy : np.array 1D array representing accuracy if we were to take the output after the corresponding step. Returns ------- matplotlib.Figure """ support = list(range(1, len(accuracy) + 1)) fig, ax = plt.subplots(dpi=140) ax.bar( support, accuracy, label="Accuracy over different steps", ) ax.set_ylim(0, 1) ax.set_xticks(support) ax.legend() ax.grid() return fig def main(argv=None): """CLI for training.""" parser = ArgumentParser() parser.add_argument( "log_folder", type=str, help="Folder where tensorboard logging is saved", ) parser.add_argument( "--batch-size", type=int, default=128, help="Batch size", ) parser.add_argument( "--beta", type=float, default=0.01, help="Regularization loss coefficient", ) parser.add_argument( "-d", "--device", type=str, choices={"cpu", "cuda"}, default="cpu", help="Device to use", ) parser.add_argument( "--eval-frequency", type=int, default=10_000, help="Evaluation is run every `eval_frequency` steps", ) parser.add_argument( "--lambda-p", type=float, default=0.4, help="True probability of success for a geometric distribution", ) parser.add_argument( "--n-iter", type=int, default=1_000_000, help="Number of gradient steps", ) parser.add_argument( "--n-elems", type=int, default=64, help="Number of elements", ) parser.add_argument( "--n-hidden", type=int, default=64, help="Number of hidden elements in the reccurent cell", ) parser.add_argument( "--n-nonzero", type=int, nargs=2, default=(None, None), help="Lower and upper bound on nonzero elements in the training 
set", ) parser.add_argument( "--max-steps", type=int, default=20, help="Maximum number of pondering steps", ) # Parameters args = parser.parse_args(argv) print(args) device = torch.device(args.device) dtype = torch.float32 n_eval_samples = 1000 batch_size_eval = 50 if args.n_nonzero[0] is None and args.n_nonzero[1] is None: threshold = int(0.3 * args.n_elems) range_nonzero_easy = (1, threshold) range_nonzero_hard = (args.n_elems - threshold, args.n_elems) else: range_nonzero_easy = (1, args.n_nonzero[1]) range_nonzero_hard = (args.n_nonzero[1] + 1, args.n_elems) # Tensorboard log_folder = pathlib.Path(args.log_folder) writer = SummaryWriter(log_folder) writer.add_text("parameters", json.dumps(vars(args))) # Prepare data dataloader_train = DataLoader( ParityDataset( n_samples=args.batch_size * args.n_iter, n_elems=args.n_elems, n_nonzero_min=args.n_nonzero[0], n_nonzero_max=args.n_nonzero[1], ), batch_size=args.batch_size, ) # consider specifying `num_workers` for speedups eval_dataloaders = { "test": DataLoader( ParityDataset( n_samples=n_eval_samples, n_elems=args.n_elems, n_nonzero_min=args.n_nonzero[0], n_nonzero_max=args.n_nonzero[1], ), batch_size=batch_size_eval, ), f"{range_nonzero_easy[0]}_{range_nonzero_easy[1]}": DataLoader( ParityDataset( n_samples=n_eval_samples, n_elems=args.n_elems, n_nonzero_min=range_nonzero_easy[0], n_nonzero_max=range_nonzero_easy[1], ), batch_size=batch_size_eval, ), f"{range_nonzero_hard[0]}_{range_nonzero_hard[1]}": DataLoader( ParityDataset( n_samples=n_eval_samples, n_elems=args.n_elems, n_nonzero_min=range_nonzero_hard[0], n_nonzero_max=range_nonzero_hard[1], ), batch_size=batch_size_eval, ), } # Model preparation module = PonderNet( n_elems=args.n_elems, n_hidden=args.n_hidden, max_steps=args.max_steps, ) module = module.to(device, dtype) # Loss preparation loss_rec_inst = ReconstructionLoss( nn.BCEWithLogitsLoss(reduction="none") ).to(device, dtype) loss_reg_inst = RegularizationLoss( lambda_p=args.lambda_p, 
max_steps=args.max_steps, ).to(device, dtype) # Optimizer optimizer = torch.optim.Adam( module.parameters(), lr=0.0003, ) # Training and evaluation loops iterator = tqdm(enumerate(dataloader_train), total=args.n_iter) for step, (x_batch, y_true_batch) in iterator: x_batch = x_batch.to(device, dtype) y_true_batch = y_true_batch.to(device, dtype) y_pred_batch, p, halting_step = module(x_batch) loss_rec = loss_rec_inst( p, y_pred_batch, y_true_batch, ) loss_reg = loss_reg_inst( p, ) loss_overall = loss_rec + args.beta * loss_reg optimizer.zero_grad() loss_overall.backward() torch.nn.utils.clip_grad_norm_(module.parameters(), 1) optimizer.step() # Logging writer.add_scalar("loss_rec", loss_rec, step) writer.add_scalar("loss_reg", loss_reg, step) writer.add_scalar("loss_overall", loss_overall, step) # Evaluation if step % args.eval_frequency == 0: module.eval() for dataloader_name, dataloader in eval_dataloaders.items(): metrics_single, metrics_per_step = evaluate( dataloader, module, ) fig_dist = plot_distributions( loss_reg_inst.p_g.cpu().numpy(), metrics_per_step["p"], ) writer.add_figure( f"distributions/{dataloader_name}", fig_dist, step ) fig_acc = plot_accuracy(metrics_per_step["accuracy"]) writer.add_figure( f"accuracy_per_step/{dataloader_name}", fig_acc, step ) for metric_name, metric_value in metrics_single.items(): writer.add_scalar( f"{metric_name}/{dataloader_name}", metric_value, step, ) torch.save(module, log_folder / "checkpoint.pth") module.train() if __name__ == "__main__": main() ================================================ FILE: github_adventures/pondernet/utils.py ================================================ import torch import torch.nn as nn from torch.utils.data import Dataset class ParityDataset(Dataset): """Parity of vectors - binary classification dataset. Parameters ---------- n_samples : int Number of samples to generate. n_elems : int Size of the vectors. 
n_nonzero_min, n_nonzero_max : int or None Minimum (inclusive) and maximum (inclusive) number of nonzero elements in the feature vector. If not specified then `(1, n_elem)`. """ def __init__( self, n_samples, n_elems, n_nonzero_min=None, n_nonzero_max=None, ): self.n_samples = n_samples self.n_elems = n_elems self.n_nonzero_min = 1 if n_nonzero_min is None else n_nonzero_min self.n_nonzero_max = ( n_elems if n_nonzero_max is None else n_nonzero_max ) assert 0 <= self.n_nonzero_min <= self.n_nonzero_max <= n_elems def __len__(self): """Get the number of samples.""" return self.n_samples def __getitem__(self, idx): """Get a feature vector and it's parity (target). Note that the generating process is random. """ x = torch.zeros((self.n_elems,)) n_non_zero = torch.randint( self.n_nonzero_min, self.n_nonzero_max + 1, (1,) ).item() x[:n_non_zero] = torch.randint(0, 2, (n_non_zero,)) * 2 - 1 x = x[torch.randperm(self.n_elems)] y = (x == 1.0).sum() % 2 return x, y class PonderNet(nn.Module): """Network that ponders. Parameters ---------- n_elems : int Number of features in the vector. n_hidden : int Hidden layer size of the recurrent cell. max_steps : int Maximum number of steps the network can "ponder" for. allow_halting : bool If True, then the forward pass is allowed to halt before reaching the maximum steps. Attributes ---------- cell : nn.GRUCell Learnable GRU cell that maps the previous hidden state and the input to a new hidden state. output_layer : nn.Linear Linear module that serves as the binary classifier. It inputs the hidden state. lambda_layer : nn.Linear Linear module that generates the halting probability at each step. 
""" def __init__( self, n_elems, n_hidden=64, max_steps=20, allow_halting=False ): super().__init__() self.max_steps = max_steps self.n_hidden = n_hidden self.allow_halting = allow_halting self.cell = nn.GRUCell(n_elems, n_hidden) self.output_layer = nn.Linear(n_hidden, 1) self.lambda_layer = nn.Linear(n_hidden, 1) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Batch of input features of shape `(batch_size, n_elems)`. Returns ------- y : torch.Tensor Tensor of shape `(max_steps, batch_size)` representing the predictions for each step and each sample. In case `allow_halting=True` then the shape is `(steps, batch_size)` where `1 <= steps <= max_steps`. p : torch.Tensor Tensor of shape `(max_steps, batch_size)` representing the halting probabilities. Sums over rows (fixing a sample) are 1. In case `allow_halting=True` then the shape is `(steps, batch_size)` where `1 <= steps <= max_steps`. halting_step : torch.Tensor An integer for each sample in the batch that corresponds to the step when it was halted. The shape is `(batch_size,)`. The minimal value is 1 because we always run at least one step. 
""" batch_size, _ = x.shape device = x.device h = x.new_zeros(batch_size, self.n_hidden) un_halted_prob = x.new_ones(batch_size) y_list = [] p_list = [] halting_step = torch.zeros( batch_size, dtype=torch.long, device=device, ) for n in range(1, self.max_steps + 1): if n == self.max_steps: lambda_n = x.new_ones(batch_size) # (batch_size,) else: lambda_n = torch.sigmoid(self.lambda_layer(h))[ :, 0 ] # (batch_size,) # Store releavant outputs y_list.append(self.output_layer(h)[:, 0]) # (batch_size,) p_list.append(un_halted_prob * lambda_n) # (batch_size,) halting_step = torch.maximum( n * (halting_step == 0) * torch.bernoulli(lambda_n).to(torch.long), halting_step, ) # Prepare for next iteration un_halted_prob = un_halted_prob * (1 - lambda_n) h = self.cell(x, h) # Potentially stop if all samples halted if self.allow_halting and (halting_step > 0).sum() == batch_size: break y = torch.stack(y_list) p = torch.stack(p_list) return y, p, halting_step class ReconstructionLoss(nn.Module): """Weighted average of per step losses. Parameters ---------- loss_func : callable Loss function that accepts `y_pred` and `y_true` as arguments. Both of these tensors have shape `(batch_size,)`. It outputs a loss for each sample in the batch. """ def __init__(self, loss_func): super().__init__() self.loss_func = loss_func def forward(self, p, y_pred, y_true): """Compute loss. Parameters ---------- p : torch.Tensor Probability of halting of shape `(max_steps, batch_size)`. y_pred : torch.Tensor Predicted outputs of shape `(max_steps, batch_size)`. y_true : torch.Tensor True targets of shape `(batch_size,)`. Returns ------- loss : torch.Tensor Scalar representing the reconstruction loss. It is nothing else than a weighted sum of per step losses. 
""" max_steps, _ = p.shape total_loss = p.new_tensor(0.0) for n in range(max_steps): loss_per_sample = p[n] * self.loss_func( y_pred[n], y_true ) # (batch_size,) total_loss = total_loss + loss_per_sample.mean() # (1,) return total_loss class RegularizationLoss(nn.Module): """Enforce halting distribution to ressemble the geometric distribution. Parameters ---------- lambda_p : float The single parameter determining uniquely the geometric distribution. Note that the expected value of this distribution is going to be `1 / lambda_p`. max_steps : int Maximum number of pondering steps. """ def __init__(self, lambda_p, max_steps=20): super().__init__() p_g = torch.zeros((max_steps,)) not_halted = 1.0 for k in range(max_steps): p_g[k] = not_halted * lambda_p not_halted = not_halted * (1 - lambda_p) self.register_buffer("p_g", p_g) self.kl_div = nn.KLDivLoss(reduction="batchmean") def forward(self, p): """Compute loss. Parameters ---------- p : torch.Tensor Probability of halting of shape `(steps, batch_size)`. Returns ------- loss : torch.Tensor Scalar representing the regularization loss. """ steps, batch_size = p.shape p = p.transpose(0, 1) # (batch_size, max_steps) p_g_batch = self.p_g[None, :steps].expand_as( p ) # (batch_size, max_steps) return self.kl_div(p.log(), p_g_batch) ================================================ FILE: github_adventures/product_quantization/README.md ================================================ # Installation Run the following to get all the dependencies. ``` pip install -r requirements.txt ``` # Faiss 101 The code for the short intro to FAISS can be found in `faiss_101_ipython.py`. Note that you can use `parse.py` to turn the raw fasttext embeddings into a numpy array. See `run_all.sh` for example usage. # Custom PQ implementation The custom PQ implementation can be found inside of `custom.py`. 
# End to end script The script `run_all.sh` does the following things: * Download fasttext embeddings * Train multiple indexes (faiss + custom) using the embeddings * Serve gradio apps for similarity search comparing different indexes ``` chmod +x run_all.sh ./run_all ``` Don't forget to kill the Gradio processes by `pkill -f gradio` once you don't need them anymore. ================================================ FILE: github_adventures/product_quantization/convert.py ================================================ import argparse import logging import pathlib import pickle import faiss from custom import CustomIndexPQ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) def from_faiss(faiss_index: faiss.swigfaiss.IndexPQ) -> CustomIndexPQ: if not faiss_index.is_trained: raise ValueError("The faiss index is not trained") if faiss_index.ntotal == 0: raise ValueError("The faiss index has no codes") d = faiss_index.d m = faiss_index.code_size nbits = faiss_index.pq.nbits k = 2**nbits ntotal = faiss_index.ntotal custom_index = CustomIndexPQ(d=d, m=m, nbits=nbits) centers = faiss.vector_to_array(faiss_index.pq.centroids).reshape( m, k, d // m ) logger.info("Copying centers from the faiss index") for i in range(m): custom_index.estimators[i].cluster_centers_ = centers[i] custom_index.is_trained = True logger.info("Copying codes form the faiss index") custom_index.codes = faiss.vector_to_array(faiss_index.codes).reshape( ntotal, m ) return custom_index def main() -> int: parser = argparse.ArgumentParser("Convert from faiss to custom") parser.add_argument( "faiss_index_path", type=pathlib.Path, help="Path to a faiss index", ) parser.add_argument( "output_index_path", type=pathlib.Path, help="Path to a new custom index with faiss parameters", ) args = parser.parse_args() faiss_index = faiss.read_index(str(args.faiss_index_path)) custom_index = from_faiss(faiss_index) with args.output_index_path.open("wb") as f: pickle.dump(custom_index, f) if 
__name__ == "__main__": main() ================================================ FILE: github_adventures/product_quantization/custom.py ================================================ from __future__ import annotations import logging import numpy as np from sklearn.cluster import KMeans from sklearn.metrics.pairwise import euclidean_distances logger = logging.getLogger(__name__) BITS2DTYPE = { 8: np.uint8, } class CustomIndexPQ: """Custom IndexPQ implementation. Parameters ---------- d Dimensionality of the original vectors. m Number of segments. nbits Number of bits. estimator_kwargs Additional hyperparameters passed onto the sklearn KMeans class. """ def __init__( self, d: int, m: int, nbits: int, **estimator_kwargs: str | int, ) -> None: if d % m != 0: raise ValueError("d needs to be a multiple of m") if nbits not in BITS2DTYPE: raise ValueError(f"Unsupported number of bits {nbits}") self.m = m self.k = 2**nbits self.d = d self.ds = d // m self.estimators = [ KMeans(n_clusters=self.k, **estimator_kwargs) for _ in range(m) ] logger.info(f"Creating following estimators: {self.estimators[0]!r}") self.is_trained = False self.dtype = BITS2DTYPE[nbits] self.dtype_orig = np.float32 self.codes: np.ndarray | None = None def train(self, X: np.ndarray) -> None: """Train all KMeans estimators. Parameters ---------- X Array of shape `(n, d)` and dtype `float32`. """ if self.is_trained: raise ValueError("Training multiple times is not allowed") for i in range(self.m): estimator = self.estimators[i] X_i = X[:, i * self.ds : (i + 1) * self.ds] logger.info(f"Fitting KMeans for the {i}-th segment") estimator.fit(X_i) self.is_trained = True def encode(self, X: np.ndarray) -> np.ndarray: """Encode original features into codes. Parameters ---------- X Array of shape `(n_queries, d)` of dtype `np.float32`. Returns ------- result Array of shape `(n_queries, m)` of dtype `np.uint8`. 
""" n = len(X) result = np.empty((n, self.m), dtype=self.dtype) for i in range(self.m): estimator = self.estimators[i] X_i = X[:, i * self.ds : (i + 1) * self.ds] result[:, i] = estimator.predict(X_i) return result def add(self, X: np.ndarray) -> None: """Add vectors to the database (their encoded versions). Parameters ---------- X Array of shape `(n_codes, d)` of dtype `np.float32`. """ if not self.is_trained: raise ValueError("The quantizer needs to be trained first.") self.codes = self.encode(X) def compute_asymmetric_distances(self, X: np.ndarray) -> np.ndarray: """Compute asymmetric distances to all database codes. Parameters ---------- X Array of shape `(n_queries, d)` of dtype `np.float32`. Returns ------- distances Array of shape `(n_queries, n_codes)` of dtype `np.float32`. """ if not self.is_trained: raise ValueError("The quantizer needs to be trained first.") if self.codes is None: raise ValueError("No codes detected. You need to run `add` first") n_queries = len(X) n_codes = len(self.codes) distance_table = np.empty( (n_queries, self.m, self.k), dtype=self.dtype_orig ) # (n_queries, m, k) for i in range(self.m): X_i = X[:, i * self.ds : (i + 1) * self.ds] # (n_queries, ds) centers = self.estimators[i].cluster_centers_ # (k, ds) distance_table[:, i, :] = euclidean_distances( X_i, centers, squared=True ) distances = np.zeros((n_queries, n_codes), dtype=self.dtype_orig) for i in range(self.m): distances += distance_table[:, i, self.codes[:, i]] return distances def search(self, X: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]: """Find k closest database codes to given queries. Parameters ---------- X Array of shape `(n_queries, d)` of dtype `np.float32`. k The number of closest codes to look for. Returns ------- distances Array of shape `(n_queries, k)`. indices Array of shape `(n_queries, k)`. 
""" n_queries = len(X) distances_all = self.compute_asymmetric_distances(X) indices = np.argsort(distances_all, axis=1)[:, :k] distances = np.empty((n_queries, k), dtype=np.float32) for i in range(n_queries): distances[i] = distances_all[i][indices[i]] return distances, indices ================================================ FILE: github_adventures/product_quantization/faiss_101_ipython.py ================================================ import numpy as np import faiss # Load fast text embeddings embs = np.load("parsed_fasttext/embs.npy") # change path if necessary embs.shape embs.nbytes / 1e6 # Prepare parameters d = embs.shape[1] m = 10 nbits = 8 k = 2 ** nbits k # Construct index index = faiss.IndexPQ(d, m, nbits) index.is_trained # Try encoding without any training index.sa_encode(embs[:2]) # Train the model index.train(embs) index.is_trained index.ntotal # Add vectors to the database index.add(embs) index.ntotal codes = faiss.vector_to_array(index.codes).reshape(index.ntotal, m) codes[:3] codes.nbytes / 1e6 # Try searching - EXHAUSTIVE SEARCH index.search(embs[:3], 4) # Quickly show that with flat index distances are precise flat_index = faiss.IndexFlatL2(d) flat_index.train(embs) flat_index.add(embs) flat_index.search(embs[:3], 4) ================================================ FILE: github_adventures/product_quantization/generate_index.py ================================================ from __future__ import annotations import argparse import logging import pathlib import pickle import faiss import numpy as np from custom import CustomIndexPQ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "input_path", type=pathlib.Path, help="Path to the full embeddings array", ) parser.add_argument( "index_type", type=str, choices=["faiss-flat", "faiss-pq", "our-pq"], help="Type of index to generate", ) parser.add_argument( "output_path", type=pathlib.Path, help="Path to where to store 
the index" ) args, unknown_kwargs = parser.parse_known_args() hyperparams: dict[str, int] = {} for i in range(0, len(unknown_kwargs), 2): key_raw, value_raw = unknown_kwargs[i], unknown_kwargs[i + 1] key = key_raw.strip("--") value = int(value_raw) if value_raw.isnumeric() else value_raw hyperparams[key] = value logger.info(f"The following hyperparameters were detected {hyperparams}") logger.info("Loading embeddings") embs = np.load(args.input_path) n, d = embs.shape if args.index_type == "faiss-flat": logger.info("Instantiating IndexFlatL2") index = faiss.IndexFlatL2(d) elif args.index_type == "faiss-pq": logger.info("Instantiating IndexPQ") arguments = [d, hyperparams["m"], hyperparams["nbits"]] index = faiss.IndexPQ(*arguments) elif args.index_type == "our-pq": logger.info("Instantiating CustomIndexPQ") index = CustomIndexPQ(d, **hyperparams) logger.info("Training the index") index.train(embs) logger.info("Adding all embeddings to the index") index.add(embs) logger.info(f"Writing index to disk - {args.output_path}") if args.index_type == "our-pq": with args.output_path.open("wb") as f: pickle.dump(index, f) else: faiss.write_index(index, str(args.output_path)) ================================================ FILE: github_adventures/product_quantization/parse.py ================================================ from __future__ import annotations import argparse import io import logging import pathlib import tqdm import numpy as np logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) def get_embeddings(path: str, maximum: int | None = None) -> tuple[list[str], np.ndarray]: fin = io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') n, d = map(int, fin.readline().split()) n = n if maximum is None else min(n, maximum) embs: np.ndarray = np.empty((n, d), dtype=np.float32) words: list[str] = [] for i, line in tqdm.tqdm(enumerate(fin)): if maximum is not None and i == maximum: break tokens = line.rstrip().split(' ') 
words.append(tokens[0]) embs[i] = list(map(float, tokens[1:])) return words, embs parser = argparse.ArgumentParser() parser.add_argument( "fasttext_path", type=pathlib.Path, help="Path to fasttext embeddings.", ) parser.add_argument( "output_dir", type=pathlib.Path, help="Directory where we store the words and the embeddings." ) parser.add_argument( "-m", "--max", type=int, help="Maximum number of embeddings to parse." ) args = parser.parse_args() path_embs = args.output_dir / "embs.npy" path_words = args.output_dir / "words.txt" args.output_dir.mkdir(exist_ok=True, parents=True) logger.info("Parsing") words, embs = get_embeddings(args.fasttext_path, maximum=args.max) logger.info("Saving words") with path_words.open("w") as f: for word in words: f.write(word + "\n") logger.info("Saving embeddings") np.save(path_embs, embs) ================================================ FILE: github_adventures/product_quantization/requirements.txt ================================================ faiss-cpu==1.7.2 gradio==3.0.17 numpy==1.22.4 pandas==1.4.2 scikit-learn==1.1.1 ================================================ FILE: github_adventures/product_quantization/run_all.sh ================================================ set -ex # Parameters URL=https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz RAW_FASTTEXT=raw_fasttext.vec MAX_WORDS=100000 OUTPUT_FOLDER=new_results # no slash SCIKIT_KWARGS='--n_init 1 --max_iter 30 --init random' # Download fasttext embeddings if [ ! 
-f $RAW_FASTTEXT ] then curl $URL --output $RAW_FASTTEXT.gz gzip -d $RAW_FASTTEXT.gz fi mkdir $OUTPUT_FOLDER # Parse raw data python parse.py $RAW_FASTTEXT $OUTPUT_FOLDER -m $MAX_WORDS # Generate a couple of different indexes python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ faiss-flat \ $OUTPUT_FOLDER/flat.faiss python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ faiss-pq \ $OUTPUT_FOLDER/faisspq_m4_nbits8.faiss \ --m 4 \ --nbits 8 python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ faiss-pq \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ --m 12 \ --nbits 8 python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ our-pq \ $OUTPUT_FOLDER/custompq_m4_nbits8.pkl \ --m 4 \ --nbits 8 \ $SCIKIT_KWARGS python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ our-pq \ $OUTPUT_FOLDER/custompq_m12_nbits8.pkl \ --m 12 \ --nbits 8 \ $SCIKIT_KWARGS # Convert faiss index into custom index python convert.py \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ $OUTPUT_FOLDER/converted_faisspq_m12_nbits8.pkl # Run webapp GRADIO_SERVER_PORT=7777 python run_gradio.py \ $OUTPUT_FOLDER/flat.faiss \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ $OUTPUT_FOLDER/converted_faisspq_m12_nbits8.pkl \ $OUTPUT_FOLDER/words.txt \ & GRADIO_SERVER_PORT=7778 python run_gradio.py \ $OUTPUT_FOLDER/flat.faiss \ $OUTPUT_FOLDER/faisspq_m4_nbits8.faiss \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ $OUTPUT_FOLDER/words.txt \ & GRADIO_SERVER_PORT=7779 python run_gradio.py \ $OUTPUT_FOLDER/flat.faiss \ $OUTPUT_FOLDER/custompq_m4_nbits8.pkl \ $OUTPUT_FOLDER/custompq_m12_nbits8.pkl \ $OUTPUT_FOLDER/words.txt \ & # make sure to kill the gradio processes pkill -f gradio ================================================ FILE: github_adventures/product_quantization/run_gradio.py ================================================ from __future__ import annotations import argparse import logging import pathlib import pickle import time from functools import partial from typing import Any import faiss import gradio as gr import numpy as np 
import pandas as pd logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "exact_index_path", type=pathlib.Path, help="Path to the exact index", ) parser.add_argument( "approximate_index_path", type=pathlib.Path, nargs="+", help="Path to the approximate index", ) parser.add_argument( "words_path", type=pathlib.Path, help="Path to the text file containing words", ) args = parser.parse_args() def run( word: str, k: int, exact_index, approximate_indexes: dict[str, Any], words: list[str], word2ix: dict[str, int], ) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, float]]: metrics = {} emb = exact_index.reconstruct(word2ix[word]) start = time.monotonic() D, I = exact_index.search(emb[None, :], k) metrics["time_exact"] = time.monotonic() - start D, I = D[0], I[0] df_e = pd.DataFrame({ "ix": I, "distance": D, "word": [words[i] for i in I], }) dfs_a = [] for name, approximate_index in approximate_indexes.items(): start = time.monotonic() D, I = approximate_index.search(emb[None, :], k) metrics[f"time_approximate_{name}"] = time.monotonic() - start D, I = D[0], I[0] df_a = pd.DataFrame({ "ix": I, "distance": D, "word": [words[i] for i in I], }) dfs_a.append(df_a) metrics[f"recall_{name}"] = len(np.intersect1d(df_e.word.unique(), df_a.word.unique())) / k return df_e, *dfs_a, metrics logger.info(f"Loading words {args.words_path}") words = args.words_path.read_text().strip().split("\n") word2ix = {word: i for i, word in enumerate(words)} logger.info(f"Loading exact index {args.exact_index_path}") exact_index = faiss.read_index(str(args.exact_index_path)) logger.info(f"Loading approximate indexes {args.approximate_index_path}") approximate_indexes = { } for path in args.approximate_index_path: if path.suffix in {".pkl", "pickle"}: with path.open("rb") as f: approximate_indexes[path.stem] = pickle.load(f) else: approximate_indexes[path.stem] = faiss.read_index(str(path)) # Sanity checks assert 
isinstance(exact_index, faiss.IndexFlat) # assert len(words) == exact_index.ntotal == approximate_index.ntotal run_partial = partial( run, exact_index=exact_index, approximate_indexes=approximate_indexes, words=words, word2ix=word2ix, ) setattr(run_partial, "__name__", "run_function") demo = gr.Interface( fn=run_partial, inputs=[ gr.Textbox(lines=1, placeholder="Word here..."), gr.Slider(minimum=1, maximum=20, value=5, step=1), ], outputs=[ gr.DataFrame(label="exact"), *[gr.DataFrame(label=name) for name in approximate_indexes.keys()], gr.JSON(label="metrics"), ], allow_flagging="never", ) demo.launch() ================================================ FILE: github_adventures/siren/activations.py ================================================ import pathlib from functools import partial import torch from torch.utils.tensorboard import SummaryWriter from core import ImageSiren torch.manual_seed(2) init_functions = { "ones": torch.nn.init.ones_, "eye": torch.nn.init.eye_, "default": partial(torch.nn.init.kaiming_uniform_, a=5 ** (1 / 2)), "paper": None, } for fname, func in init_functions.items(): path = pathlib.Path.cwd() / "tensorboard_logs" / fname writer = SummaryWriter(path) def fh(inst, inp, out, number=0): layer_name = f"{number}_{inst.__class__.__name__}" writer.add_histogram(layer_name, out) model = ImageSiren( hidden_layers=10, hidden_features=200, first_omega=30, hidden_omega=30, custom_init_function_=func, ) for i, layer in enumerate(model.net.modules()): if not i: continue layer.register_forward_hook(partial(fh, number=(i + 1) // 2)) inp = 2 * (torch.rand(10000, 2) - 0.5) writer.add_histogram("0", inp) res = model(inp) ================================================ FILE: github_adventures/siren/core.py ================================================ import numpy as np import torch import torch.nn as nn from scipy.ndimage import laplace, sobel from torch.utils.data import Dataset def paper_init_(weight, is_first=False, omega=1): """Initialize the 
weigth of the Linear layer. Parameters ---------- weight : torch.Tensor The learnable 2D weight matrix. is_first : bool If True, this Linear layer is the very first one in the network. omega : float Hyperparamter. """ in_features = weight.shape[1] with torch.no_grad(): if is_first: bound = 1 / in_features else: bound = np.sqrt(6 / in_features) / omega weight.uniform_(-bound, bound) class SineLayer(nn.Module): """Linear layer followed by the sine activation. Parameters ---------- in_features : int Number of input features. out_features : int Number of output features. bias : bool If True, the bias is included. is_first : bool If True, then it represents the first layer of the network. Note that it influences the initialization scheme. omega : int Hyperparameter. Determines scaling. custom_init_function_ : None or callable If None, then we are going to use the `paper_init_` defined above. Otherwise, any callable that modifies the `weight` parameter in place. Attributes ---------- linear : nn.Linear Linear layer. """ def __init__( self, in_features, out_features, bias=True, is_first=False, omega=30, custom_init_function_=None, ): super().__init__() self.omega = omega self.linear = nn.Linear(in_features, out_features, bias=bias) if custom_init_function_ is None: paper_init_(self.linear.weight, is_first=is_first, omega=omega) else: custom_init_function_(self.linear.weight) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Tensor of shape `(n_samples, in_features)`. Returns ------- torch.Tensor Tensor of shape `(n_samples, out_features). """ return torch.sin(self.omega * self.linear(x)) class ImageSiren(nn.Module): """Network composed of SineLayers. Parameters ---------- hidden_features : int Number of hidden features (each hidden layer the same). hidden_layers : int Number of hidden layers. first_omega, hidden_omega : float Hyperparameter influencing scaling. 
custom_init_function_ : None or callable If None, then we are going to use the `paper_init_` defined above. Otherwise any callable that modifies the `weight` parameter in place. Attributes ---------- net : nn.Sequential Sequential collection of `SineLayer` and `nn.Linear` at the end. """ def __init__( self, hidden_features, hidden_layers=1, first_omega=30, hidden_omega=30, custom_init_function_=None, ): super().__init__() in_features = 2 out_features = 1 net = [] net.append( SineLayer( in_features, hidden_features, is_first=True, custom_init_function_=custom_init_function_, omega=first_omega, ) ) for _ in range(hidden_layers): net.append( SineLayer( hidden_features, hidden_features, is_first=False, custom_init_function_=custom_init_function_, omega=hidden_omega, ) ) final_linear = nn.Linear(hidden_features, out_features) if custom_init_function_ is None: paper_init_(final_linear.weight, is_first=False, omega=hidden_omega) else: custom_init_function_(final_linear.weight) net.append(final_linear) self.net = nn.Sequential(*net) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Tensor of shape `(n_samples, 2)` representing the 2D pixel coordinates. Returns ------- torch.Tensor Tensor of shape `(n_samples, 1)` representing the predicted intensities. """ return self.net(x) def generate_coordinates(n): """Generate regular grid of 2D coordinates on [0, n] x [0, n]. Parameters ---------- n : int Number of points per dimension. Returns ------- coords_abs : np.ndarray Array of row and column coordinates of shape `(n ** 2, 2)`. """ rows, cols = np.meshgrid(range(n), range(n), indexing="ij") coords_abs = np.stack([rows.ravel(), cols.ravel()], axis=-1) return coords_abs class PixelDataset(Dataset): """Dataset yielding coordinates, intensitives and (higher) derivatives. Parameters ---------- img : np.ndarray 2D image representing a grayscale image. Attributes ---------- size : int Height and width of the square image. 
coords_abs : np.ndarray Array of shape `(size ** 2, 2)` representing all coordinates of the `img`. grad : np.ndarray Array of shape `(size, size, 2)` representing the approximate gradient in the two directions. grad_norm : np.ndarray Array of shape `(size, size)` representing the approximate gradient norm of `img`. laplace : np.ndarray Array of shape `(size, size)` representing the approximate laplace operator. """ def __init__(self, img): if not (img.ndim == 2 and img.shape[0] == img.shape[1]): raise ValueError("Only 2D square images are supported.") self.img = img self.size = img.shape[0] self.coords_abs = generate_coordinates(self.size) self.grad = np.stack([sobel(img, axis=0), sobel(img, axis=1)], axis=-1) self.grad_norm = np.linalg.norm(self.grad, axis=-1) self.laplace = laplace(img) def __len__(self): """Determine the number of samples (pixels).""" return self.size ** 2 def __getitem__(self, idx): """Get all relevant data for a single coordinate.""" coords_abs = self.coords_abs[idx] r, c = coords_abs coords = 2 * ((coords_abs / self.size) - 0.5) return { "coords": coords, "coords_abs": coords_abs, "intensity": self.img[r, c], "grad_norm": self.grad_norm[r, c], "grad": self.grad[r, c], "laplace": self.laplace[r, c], } class GradientUtils: @staticmethod def gradient(target, coords): """Compute the gradient with respect to input. Parameters ---------- target : torch.Tensor 2D tensor of shape `(n_coords, ?)` representing the targets. coords : torch.Tensor 2D tensor fo shape `(n_coords, 2)` representing the coordinates. Returns ------- grad : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the gradient. """ return torch.autograd.grad( target, coords, grad_outputs=torch.ones_like(target), create_graph=True )[0] @staticmethod def divergence(grad, coords): """Compute divergence. Parameters ---------- grad : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the gradient wrt x and y. 
coords : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the coordinates. Returns ------- div : torch.Tensor 2D tensor of shape `(n_coords, 1)` representing the divergence. Notes ----- In a 2D case this will give us f_{xx} + f_{yy}. """ div = 0.0 for i in range(coords.shape[1]): div += torch.autograd.grad( grad[..., i], coords, torch.ones_like(grad[..., i]), create_graph=True, )[0][..., i : i + 1] return div @staticmethod def laplace(target, coords): """Compute laplace operator. Parameters ---------- target : torch.Tensor 2D tesnor of shape `(n_coords, 1)` representing the targets. coords : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the coordinates. Returns ------- torch.Tensor 2D tensor of shape `(n_coords, 1)` representing the laplace. """ grad = GradientUtils.gradient(target, coords) return GradientUtils.divergence(grad, coords) ================================================ FILE: github_adventures/siren/train.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch from torch.nn import Linear, ReLU, Sequential from torch.utils.data import DataLoader import tqdm from core import GradientUtils, ImageSiren, PixelDataset # Image loading img_ = plt.imread("dog.png") downsampling_factor = 4 img = 2 * (img_ - 0.5) img = img[::downsampling_factor, ::downsampling_factor] size = img.shape[0] dataset = PixelDataset(img) # Parameters n_epochs = 100 batch_size = int(size ** 2) logging_freq = 20 model_name = "siren" # "siren", "mlp_relu" hidden_features = 256 hidden_layers = 3 target = "intensity" # "intensity", "grad", "laplace" # Model creation if model_name == "siren": model = ImageSiren( hidden_features, hidden_layers=hidden_layers, hidden_omega=30, ) elif model_name == "mlp_relu": layers = [Linear(2, hidden_features), ReLU()] for _ in range(hidden_layers): layers.append(Linear(hidden_features, hidden_features)) layers.append(ReLU()) layers.append(Linear(hidden_features, 1)) model 
= Sequential(*layers) for module in model.modules(): if not isinstance(module, Linear): continue torch.nn.init.xavier_normal_(module.weight) else: raise ValueError("Unsupported model") dataloader = DataLoader(dataset, batch_size=batch_size) optim = torch.optim.Adam(lr=1e-4, params=model.parameters()) # Training loop for e in range(n_epochs): losses = [] for d_batch in tqdm.tqdm(dataloader): x_batch = d_batch["coords"].to(torch.float32) x_batch.requires_grad = True y_true_batch = d_batch["intensity"].to(torch.float32) y_true_batch = y_true_batch[:, None] y_pred_batch = model(x_batch) if target == "intensity": loss = ((y_true_batch - y_pred_batch) ** 2).mean() elif target == "grad": y_pred_g_batch = GradientUtils.gradient(y_pred_batch, x_batch) y_true_g_batch = d_batch["grad"].to(torch.float32) loss = ((y_true_g_batch - y_pred_g_batch) ** 2).mean() elif target == "laplace": y_pred_l_batch = GradientUtils.laplace(y_pred_batch, x_batch) y_true_l_batch = d_batch["laplace"].to(torch.float32)[:, None] loss = ((y_true_l_batch - y_pred_l_batch) ** 2).mean() else: raise ValueError("Unrecognized target") losses.append(loss.item()) optim.zero_grad() loss.backward() optim.step() print(e, np.mean(losses)) if e % logging_freq == 0: pred_img = np.zeros_like(img) pred_img_grad_norm = np.zeros_like(img) pred_img_laplace = np.zeros_like(img) orig_img = np.zeros_like(img) for d_batch in tqdm.tqdm(dataloader): coords = d_batch["coords"].to(torch.float32) coords.requires_grad = True coords_abs = d_batch["coords_abs"].numpy() pred = model(coords) pred_n = pred.detach().numpy().squeeze() pred_g = ( GradientUtils.gradient(pred, coords) .norm(dim=-1) .detach() .numpy() .squeeze() ) pred_l = GradientUtils.laplace(pred, coords).detach().numpy().squeeze() pred_img[coords_abs[:, 0], coords_abs[:, 1]] = pred_n pred_img_grad_norm[coords_abs[:, 0], coords_abs[:, 1]] = pred_g pred_img_laplace[coords_abs[:, 0], coords_abs[:, 1]] = pred_l fig, axs = plt.subplots(3, 2, constrained_layout=True) axs[0, 
0].imshow(dataset.img, cmap="gray") axs[0, 1].imshow(pred_img, cmap="gray") axs[1, 0].imshow(dataset.grad_norm, cmap="gray") axs[1, 1].imshow(pred_img_grad_norm, cmap="gray") axs[2, 0].imshow(dataset.laplace, cmap="gray") axs[2, 1].imshow(pred_img_laplace, cmap="gray") for row in axs: for ax in row: ax.set_axis_off() fig.suptitle(f"Iteration: {e}") axs[0, 0].set_title("Ground truth") axs[0, 1].set_title("Prediction") plt.savefig(f"visualization/{e}.png") ================================================ FILE: github_adventures/vision_transformer/classes.txt ================================================ tench, Tinca_tinca goldfish, Carassius_auratus great_white_shark, white_shark, man-eater, man-eating_shark, Carcharodon_carcharias tiger_shark, Galeocerdo_cuvieri hammerhead, hammerhead_shark electric_ray, crampfish, numbfish, torpedo stingray cock hen ostrich, Struthio_camelus brambling, Fringilla_montifringilla goldfinch, Carduelis_carduelis house_finch, linnet, Carpodacus_mexicanus junco, snowbird indigo_bunting, indigo_finch, indigo_bird, Passerina_cyanea robin, American_robin, Turdus_migratorius bulbul jay magpie chickadee water_ouzel, dipper kite bald_eagle, American_eagle, Haliaeetus_leucocephalus vulture great_grey_owl, great_gray_owl, Strix_nebulosa European_fire_salamander, Salamandra_salamandra common_newt, Triturus_vulgaris eft spotted_salamander, Ambystoma_maculatum axolotl, mud_puppy, Ambystoma_mexicanum bullfrog, Rana_catesbeiana tree_frog, tree-frog tailed_frog, bell_toad, ribbed_toad, tailed_toad, Ascaphus_trui loggerhead, loggerhead_turtle, Caretta_caretta leatherback_turtle, leatherback, leathery_turtle, Dermochelys_coriacea mud_turtle terrapin box_turtle, box_tortoise banded_gecko common_iguana, iguana, Iguana_iguana American_chameleon, anole, Anolis_carolinensis whiptail, whiptail_lizard agama frilled_lizard, Chlamydosaurus_kingi alligator_lizard Gila_monster, Heloderma_suspectum green_lizard, Lacerta_viridis African_chameleon, 
Chamaeleo_chamaeleon Komodo_dragon, Komodo_lizard, dragon_lizard, giant_lizard, Varanus_komodoensis African_crocodile, Nile_crocodile, Crocodylus_niloticus American_alligator, Alligator_mississipiensis triceratops thunder_snake, worm_snake, Carphophis_amoenus ringneck_snake, ring-necked_snake, ring_snake hognose_snake, puff_adder, sand_viper green_snake, grass_snake king_snake, kingsnake garter_snake, grass_snake water_snake vine_snake night_snake, Hypsiglena_torquata boa_constrictor, Constrictor_constrictor rock_python, rock_snake, Python_sebae Indian_cobra, Naja_naja green_mamba sea_snake horned_viper, cerastes, sand_viper, horned_asp, Cerastes_cornutus diamondback, diamondback_rattlesnake, Crotalus_adamanteus sidewinder, horned_rattlesnake, Crotalus_cerastes trilobite harvestman, daddy_longlegs, Phalangium_opilio scorpion black_and_gold_garden_spider, Argiope_aurantia barn_spider, Araneus_cavaticus garden_spider, Aranea_diademata black_widow, Latrodectus_mactans tarantula wolf_spider, hunting_spider tick centipede black_grouse ptarmigan ruffed_grouse, partridge, Bonasa_umbellus prairie_chicken, prairie_grouse, prairie_fowl peacock quail partridge African_grey, African_gray, Psittacus_erithacus macaw sulphur-crested_cockatoo, Kakatoe_galerita, Cacatua_galerita lorikeet coucal bee_eater hornbill hummingbird jacamar toucan drake red-breasted_merganser, Mergus_serrator goose black_swan, Cygnus_atratus tusker echidna, spiny_anteater, anteater platypus, duckbill, duckbilled_platypus, duck-billed_platypus, Ornithorhynchus_anatinus wallaby, brush_kangaroo koala, koala_bear, kangaroo_bear, native_bear, Phascolarctos_cinereus wombat jellyfish sea_anemone, anemone brain_coral flatworm, platyhelminth nematode, nematode_worm, roundworm conch snail slug sea_slug, nudibranch chiton, coat-of-mail_shell, sea_cradle, polyplacophore chambered_nautilus, pearly_nautilus, nautilus Dungeness_crab, Cancer_magister rock_crab, Cancer_irroratus fiddler_crab king_crab, Alaska_crab, 
Alaskan_king_crab, Alaska_king_crab, Paralithodes_camtschatica American_lobster, Northern_lobster, Maine_lobster, Homarus_americanus spiny_lobster, langouste, rock_lobster, crawfish, crayfish, sea_crawfish crayfish, crawfish, crawdad, crawdaddy hermit_crab isopod white_stork, Ciconia_ciconia black_stork, Ciconia_nigra spoonbill flamingo little_blue_heron, Egretta_caerulea American_egret, great_white_heron, Egretta_albus bittern crane limpkin, Aramus_pictus European_gallinule, Porphyrio_porphyrio American_coot, marsh_hen, mud_hen, water_hen, Fulica_americana bustard ruddy_turnstone, Arenaria_interpres red-backed_sandpiper, dunlin, Erolia_alpina redshank, Tringa_totanus dowitcher oystercatcher, oyster_catcher pelican king_penguin, Aptenodytes_patagonica albatross, mollymawk grey_whale, gray_whale, devilfish, Eschrichtius_gibbosus, Eschrichtius_robustus killer_whale, killer, orca, grampus, sea_wolf, Orcinus_orca dugong, Dugong_dugon sea_lion Chihuahua Japanese_spaniel Maltese_dog, Maltese_terrier, Maltese Pekinese, Pekingese, Peke Shih-Tzu Blenheim_spaniel papillon toy_terrier Rhodesian_ridgeback Afghan_hound, Afghan basset, basset_hound beagle bloodhound, sleuthhound bluetick black-and-tan_coonhound Walker_hound, Walker_foxhound English_foxhound redbone borzoi, Russian_wolfhound Irish_wolfhound Italian_greyhound whippet Ibizan_hound, Ibizan_Podenco Norwegian_elkhound, elkhound otterhound, otter_hound Saluki, gazelle_hound Scottish_deerhound, deerhound Weimaraner Staffordshire_bullterrier, Staffordshire_bull_terrier American_Staffordshire_terrier, Staffordshire_terrier, American_pit_bull_terrier, pit_bull_terrier Bedlington_terrier Border_terrier Kerry_blue_terrier Irish_terrier Norfolk_terrier Norwich_terrier Yorkshire_terrier wire-haired_fox_terrier Lakeland_terrier Sealyham_terrier, Sealyham Airedale, Airedale_terrier cairn, cairn_terrier Australian_terrier Dandie_Dinmont, Dandie_Dinmont_terrier Boston_bull, Boston_terrier miniature_schnauzer giant_schnauzer 
standard_schnauzer Scotch_terrier, Scottish_terrier, Scottie Tibetan_terrier, chrysanthemum_dog silky_terrier, Sydney_silky soft-coated_wheaten_terrier West_Highland_white_terrier Lhasa, Lhasa_apso flat-coated_retriever curly-coated_retriever golden_retriever Labrador_retriever Chesapeake_Bay_retriever German_short-haired_pointer vizsla, Hungarian_pointer English_setter Irish_setter, red_setter Gordon_setter Brittany_spaniel clumber, clumber_spaniel English_springer, English_springer_spaniel Welsh_springer_spaniel cocker_spaniel, English_cocker_spaniel, cocker Sussex_spaniel Irish_water_spaniel kuvasz schipperke groenendael malinois briard kelpie komondor Old_English_sheepdog, bobtail Shetland_sheepdog, Shetland_sheep_dog, Shetland collie Border_collie Bouvier_des_Flandres, Bouviers_des_Flandres Rottweiler German_shepherd, German_shepherd_dog, German_police_dog, alsatian Doberman, Doberman_pinscher miniature_pinscher Greater_Swiss_Mountain_dog Bernese_mountain_dog Appenzeller EntleBucher boxer bull_mastiff Tibetan_mastiff French_bulldog Great_Dane Saint_Bernard, St_Bernard Eskimo_dog, husky malamute, malemute, Alaskan_malamute Siberian_husky dalmatian, coach_dog, carriage_dog affenpinscher, monkey_pinscher, monkey_dog basenji pug, pug-dog Leonberg Newfoundland, Newfoundland_dog Great_Pyrenees Samoyed, Samoyede Pomeranian chow, chow_chow keeshond Brabancon_griffon Pembroke, Pembroke_Welsh_corgi Cardigan, Cardigan_Welsh_corgi toy_poodle miniature_poodle standard_poodle Mexican_hairless timber_wolf, grey_wolf, gray_wolf, Canis_lupus white_wolf, Arctic_wolf, Canis_lupus_tundrarum red_wolf, maned_wolf, Canis_rufus, Canis_niger coyote, prairie_wolf, brush_wolf, Canis_latrans dingo, warrigal, warragal, Canis_dingo dhole, Cuon_alpinus African_hunting_dog, hyena_dog, Cape_hunting_dog, Lycaon_pictus hyena, hyaena red_fox, Vulpes_vulpes kit_fox, Vulpes_macrotis Arctic_fox, white_fox, Alopex_lagopus grey_fox, gray_fox, Urocyon_cinereoargenteus tabby, tabby_cat tiger_cat 
Persian_cat Siamese_cat, Siamese Egyptian_cat cougar, puma, catamount, mountain_lion, painter, panther, Felis_concolor lynx, catamount leopard, Panthera_pardus snow_leopard, ounce, Panthera_uncia jaguar, panther, Panthera_onca, Felis_onca lion, king_of_beasts, Panthera_leo tiger, Panthera_tigris cheetah, chetah, Acinonyx_jubatus brown_bear, bruin, Ursus_arctos American_black_bear, black_bear, Ursus_americanus, Euarctos_americanus ice_bear, polar_bear, Ursus_Maritimus, Thalarctos_maritimus sloth_bear, Melursus_ursinus, Ursus_ursinus mongoose meerkat, mierkat tiger_beetle ladybug, ladybeetle, lady_beetle, ladybird, ladybird_beetle ground_beetle, carabid_beetle long-horned_beetle, longicorn, longicorn_beetle leaf_beetle, chrysomelid dung_beetle rhinoceros_beetle weevil fly bee ant, emmet, pismire grasshopper, hopper cricket walking_stick, walkingstick, stick_insect cockroach, roach mantis, mantid cicada, cicala leafhopper lacewing, lacewing_fly dragonfly, darning_needle, devil's_darning_needle, sewing_needle, snake_feeder, snake_doctor, mosquito_hawk, skeeter_hawk damselfly admiral ringlet, ringlet_butterfly monarch, monarch_butterfly, milkweed_butterfly, Danaus_plexippus cabbage_butterfly sulphur_butterfly, sulfur_butterfly lycaenid, lycaenid_butterfly starfish, sea_star sea_urchin sea_cucumber, holothurian wood_rabbit, cottontail, cottontail_rabbit hare Angora, Angora_rabbit hamster porcupine, hedgehog fox_squirrel, eastern_fox_squirrel, Sciurus_niger marmot beaver guinea_pig, Cavia_cobaya sorrel zebra hog, pig, grunter, squealer, Sus_scrofa wild_boar, boar, Sus_scrofa warthog hippopotamus, hippo, river_horse, Hippopotamus_amphibius ox water_buffalo, water_ox, Asiatic_buffalo, Bubalus_bubalis bison ram, tup bighorn, bighorn_sheep, cimarron, Rocky_Mountain_bighorn, Rocky_Mountain_sheep, Ovis_canadensis ibex, Capra_ibex hartebeest impala, Aepyceros_melampus gazelle Arabian_camel, dromedary, Camelus_dromedarius llama weasel mink polecat, fitch, foulmart, foumart, 
Mustela_putorius black-footed_ferret, ferret, Mustela_nigripes otter skunk, polecat, wood_pussy badger armadillo three-toed_sloth, ai, Bradypus_tridactylus orangutan, orang, orangutang, Pongo_pygmaeus gorilla, Gorilla_gorilla chimpanzee, chimp, Pan_troglodytes gibbon, Hylobates_lar siamang, Hylobates_syndactylus, Symphalangus_syndactylus guenon, guenon_monkey patas, hussar_monkey, Erythrocebus_patas baboon macaque langur colobus, colobus_monkey proboscis_monkey, Nasalis_larvatus marmoset capuchin, ringtail, Cebus_capucinus howler_monkey, howler titi, titi_monkey spider_monkey, Ateles_geoffroyi squirrel_monkey, Saimiri_sciureus Madagascar_cat, ring-tailed_lemur, Lemur_catta indri, indris, Indri_indri, Indri_brevicaudatus Indian_elephant, Elephas_maximus African_elephant, Loxodonta_africana lesser_panda, red_panda, panda, bear_cat, cat_bear, Ailurus_fulgens giant_panda, panda, panda_bear, coon_bear, Ailuropoda_melanoleuca barracouta, snoek eel coho, cohoe, coho_salmon, blue_jack, silver_salmon, Oncorhynchus_kisutch rock_beauty, Holocanthus_tricolor anemone_fish sturgeon gar, garfish, garpike, billfish, Lepisosteus_osseus lionfish puffer, pufferfish, blowfish, globefish abacus abaya academic_gown, academic_robe, judge's_robe accordion, piano_accordion, squeeze_box acoustic_guitar aircraft_carrier, carrier, flattop, attack_aircraft_carrier airliner airship, dirigible altar ambulance amphibian, amphibious_vehicle analog_clock apiary, bee_house apron ashcan, trash_can, garbage_can, wastebin, ash_bin, ash-bin, ashbin, dustbin, trash_barrel, trash_bin assault_rifle, assault_gun backpack, back_pack, knapsack, packsack, rucksack, haversack bakery, bakeshop, bakehouse balance_beam, beam balloon ballpoint, ballpoint_pen, ballpen, Biro Band_Aid banjo bannister, banister, balustrade, balusters, handrail barbell barber_chair barbershop barn barometer barrel, cask barrow, garden_cart, lawn_cart, wheelbarrow baseball basketball bassinet bassoon bathing_cap, swimming_cap bath_towel 
bathtub, bathing_tub, bath, tub beach_wagon, station_wagon, wagon, estate_car, beach_waggon, station_waggon, waggon beacon, lighthouse, beacon_light, pharos beaker bearskin, busby, shako beer_bottle beer_glass bell_cote, bell_cot bib bicycle-built-for-two, tandem_bicycle, tandem bikini, two-piece binder, ring-binder binoculars, field_glasses, opera_glasses birdhouse boathouse bobsled, bobsleigh, bob bolo_tie, bolo, bola_tie, bola bonnet, poke_bonnet bookcase bookshop, bookstore, bookstall bottlecap bow bow_tie, bow-tie, bowtie brass, memorial_tablet, plaque brassiere, bra, bandeau breakwater, groin, groyne, mole, bulwark, seawall, jetty breastplate, aegis, egis broom bucket, pail buckle bulletproof_vest bullet_train, bullet butcher_shop, meat_market cab, hack, taxi, taxicab caldron, cauldron candle, taper, wax_light cannon canoe can_opener, tin_opener cardigan car_mirror carousel, carrousel, merry-go-round, roundabout, whirligig carpenter's_kit, tool_kit carton car_wheel cash_machine, cash_dispenser, automated_teller_machine, automatic_teller_machine, automated_teller, automatic_teller, ATM cassette cassette_player castle catamaran CD_player cello, violoncello cellular_telephone, cellular_phone, cellphone, cell, mobile_phone chain chainlink_fence chain_mail, ring_mail, mail, chain_armor, chain_armour, ring_armor, ring_armour chain_saw, chainsaw chest chiffonier, commode chime, bell, gong china_cabinet, china_closet Christmas_stocking church, church_building cinema, movie_theater, movie_theatre, movie_house, picture_palace cleaver, meat_cleaver, chopper cliff_dwelling cloak clog, geta, patten, sabot cocktail_shaker coffee_mug coffeepot coil, spiral, volute, whorl, helix combination_lock computer_keyboard, keypad confectionery, confectionary, candy_store container_ship, containership, container_vessel convertible corkscrew, bottle_screw cornet, horn, trumpet, trump cowboy_boot cowboy_hat, ten-gallon_hat cradle crane crash_helmet crate crib, cot Crock_Pot croquet_ball 
crutch cuirass dam, dike, dyke desk desktop_computer dial_telephone, dial_phone diaper, nappy, napkin digital_clock digital_watch dining_table, board dishrag, dishcloth dishwasher, dish_washer, dishwashing_machine disk_brake, disc_brake dock, dockage, docking_facility dogsled, dog_sled, dog_sleigh dome doormat, welcome_mat drilling_platform, offshore_rig drum, membranophone, tympan drumstick dumbbell Dutch_oven electric_fan, blower electric_guitar electric_locomotive entertainment_center envelope espresso_maker face_powder feather_boa, boa file, file_cabinet, filing_cabinet fireboat fire_engine, fire_truck fire_screen, fireguard flagpole, flagstaff flute, transverse_flute folding_chair football_helmet forklift fountain fountain_pen four-poster freight_car French_horn, horn frying_pan, frypan, skillet fur_coat garbage_truck, dustcart gasmask, respirator, gas_helmet gas_pump, gasoline_pump, petrol_pump, island_dispenser goblet go-kart golf_ball golfcart, golf_cart gondola gong, tam-tam gown grand_piano, grand greenhouse, nursery, glasshouse grille, radiator_grille grocery_store, grocery, food_market, market guillotine hair_slide hair_spray half_track hammer hamper hand_blower, blow_dryer, blow_drier, hair_dryer, hair_drier hand-held_computer, hand-held_microcomputer handkerchief, hankie, hanky, hankey hard_disc, hard_disk, fixed_disk harmonica, mouth_organ, harp, mouth_harp harp harvester, reaper hatchet holster home_theater, home_theatre honeycomb hook, claw hoopskirt, crinoline horizontal_bar, high_bar horse_cart, horse-cart hourglass iPod iron, smoothing_iron jack-o'-lantern jean, blue_jean, denim jeep, landrover jersey, T-shirt, tee_shirt jigsaw_puzzle jinrikisha, ricksha, rickshaw joystick kimono knee_pad knot lab_coat, laboratory_coat ladle lampshade, lamp_shade laptop, laptop_computer lawn_mower, mower lens_cap, lens_cover letter_opener, paper_knife, paperknife library lifeboat lighter, light, igniter, ignitor limousine, limo liner, ocean_liner lipstick, 
lip_rouge Loafer lotion loudspeaker, speaker, speaker_unit, loudspeaker_system, speaker_system loupe, jeweler's_loupe lumbermill, sawmill magnetic_compass mailbag, postbag mailbox, letter_box maillot maillot, tank_suit manhole_cover maraca marimba, xylophone mask matchstick maypole maze, labyrinth measuring_cup medicine_chest, medicine_cabinet megalith, megalithic_structure microphone, mike microwave, microwave_oven military_uniform milk_can minibus miniskirt, mini minivan missile mitten mixing_bowl mobile_home, manufactured_home Model_T modem monastery monitor moped mortar mortarboard mosque mosquito_net motor_scooter, scooter mountain_bike, all-terrain_bike, off-roader mountain_tent mouse, computer_mouse mousetrap moving_van muzzle nail neck_brace necklace nipple notebook, notebook_computer obelisk oboe, hautboy, hautbois ocarina, sweet_potato odometer, hodometer, mileometer, milometer oil_filter organ, pipe_organ oscilloscope, scope, cathode-ray_oscilloscope, CRO overskirt oxcart oxygen_mask packet paddle, boat_paddle paddlewheel, paddle_wheel padlock paintbrush pajama, pyjama, pj's, jammies palace panpipe, pandean_pipe, syrinx paper_towel parachute, chute parallel_bars, bars park_bench parking_meter passenger_car, coach, carriage patio, terrace pay-phone, pay-station pedestal, plinth, footstall pencil_box, pencil_case pencil_sharpener perfume, essence Petri_dish photocopier pick, plectrum, plectron pickelhaube picket_fence, paling pickup, pickup_truck pier piggy_bank, penny_bank pill_bottle pillow ping-pong_ball pinwheel pirate, pirate_ship pitcher, ewer plane, carpenter's_plane, woodworking_plane planetarium plastic_bag plate_rack plow, plough plunger, plumber's_helper Polaroid_camera, Polaroid_Land_camera pole police_van, police_wagon, paddy_wagon, patrol_wagon, wagon, black_Maria poncho pool_table, billiard_table, snooker_table pop_bottle, soda_bottle pot, flowerpot potter's_wheel power_drill prayer_rug, prayer_mat printer prison, prison_house projectile, 
missile projector puck, hockey_puck punching_bag, punch_bag, punching_ball, punchball purse quill, quill_pen quilt, comforter, comfort, puff racer, race_car, racing_car racket, racquet radiator radio, wireless radio_telescope, radio_reflector rain_barrel recreational_vehicle, RV, R.V. reel reflex_camera refrigerator, icebox remote_control, remote restaurant, eating_house, eating_place, eatery revolver, six-gun, six-shooter rifle rocking_chair, rocker rotisserie rubber_eraser, rubber, pencil_eraser rugby_ball rule, ruler running_shoe safe safety_pin saltshaker, salt_shaker sandal sarong sax, saxophone scabbard scale, weighing_machine school_bus schooner scoreboard screen, CRT_screen screw screwdriver seat_belt, seatbelt sewing_machine shield, buckler shoe_shop, shoe-shop, shoe_store shoji shopping_basket shopping_cart shovel shower_cap shower_curtain ski ski_mask sleeping_bag slide_rule, slipstick sliding_door slot, one-armed_bandit snorkel snowmobile snowplow, snowplough soap_dispenser soccer_ball sock solar_dish, solar_collector, solar_furnace sombrero soup_bowl space_bar space_heater space_shuttle spatula speedboat spider_web, spider's_web spindle sports_car, sport_car spotlight, spot stage steam_locomotive steel_arch_bridge steel_drum stethoscope stole stone_wall stopwatch, stop_watch stove strainer streetcar, tram, tramcar, trolley, trolley_car stretcher studio_couch, day_bed stupa, tope submarine, pigboat, sub, U-boat suit, suit_of_clothes sundial sunglass sunglasses, dark_glasses, shades sunscreen, sunblock, sun_blocker suspension_bridge swab, swob, mop sweatshirt swimming_trunks, bathing_trunks swing switch, electric_switch, electrical_switch syringe table_lamp tank, army_tank, armored_combat_vehicle, armoured_combat_vehicle tape_player teapot teddy, teddy_bear television, television_system tennis_ball thatch, thatched_roof theater_curtain, theatre_curtain thimble thresher, thrasher, threshing_machine throne tile_roof toaster tobacco_shop, tobacconist_shop, 
tobacconist toilet_seat torch totem_pole tow_truck, tow_car, wrecker toyshop tractor trailer_truck, tractor_trailer, trucking_rig, rig, articulated_lorry, semi tray trench_coat tricycle, trike, velocipede trimaran tripod triumphal_arch trolleybus, trolley_coach, trackless_trolley trombone tub, vat turnstile typewriter_keyboard umbrella unicycle, monocycle upright, upright_piano vacuum, vacuum_cleaner vase vault velvet vending_machine vestment viaduct violin, fiddle volleyball waffle_iron wall_clock wallet, billfold, notecase, pocketbook wardrobe, closet, press warplane, military_plane washbasin, handbasin, washbowl, lavabo, wash-hand_basin washer, automatic_washer, washing_machine water_bottle water_jug water_tower whiskey_jug whistle wig window_screen window_shade Windsor_tie wine_bottle wing wok wooden_spoon wool, woolen, woollen worm_fence, snake_fence, snake-rail_fence, Virginia_fence wreck yawl yurt web_site, website, internet_site, site comic_book crossword_puzzle, crossword street_sign traffic_light, traffic_signal, stoplight book_jacket, dust_cover, dust_jacket, dust_wrapper menu plate guacamole consomme hot_pot, hotpot trifle ice_cream, icecream ice_lolly, lolly, lollipop, popsicle French_loaf bagel, beigel pretzel cheeseburger hotdog, hot_dog, red_hot mashed_potato head_cabbage broccoli cauliflower zucchini, courgette spaghetti_squash acorn_squash butternut_squash cucumber, cuke artichoke, globe_artichoke bell_pepper cardoon mushroom Granny_Smith strawberry orange lemon fig pineapple, ananas banana jackfruit, jak, jack custard_apple pomegranate hay carbonara chocolate_sauce, chocolate_syrup dough meat_loaf, meatloaf pizza, pizza_pie potpie burrito red_wine espresso cup eggnog alp bubble cliff, drop, drop-off coral_reef geyser lakeside, lakeshore promontory, headland, head, foreland sandbar, sand_bar seashore, coast, seacoast, sea-coast valley, vale volcano ballplayer, baseball_player groom, bridegroom scuba_diver rapeseed daisy yellow_lady's_slipper, 
class PatchEmbed(nn.Module):
    """Turn an image into a sequence of embedded patches.

    A single strided convolution both cuts the image into
    non-overlapping squares and linearly projects each of them.

    Parameters
    ----------
    img_size : int
        Height and width of the (square) input image.
    patch_size : int
        Height and width of each (square) patch.
    in_chans : int
        Number of input channels.
    embed_dim : int
        Dimensionality of the patch embeddings.

    Attributes
    ----------
    n_patches : int
        Total number of patches per image.
    proj : nn.Conv2d
        Convolution that performs the split-and-embed step in one go
        (kernel and stride both equal to the patch size).
    """

    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Embed every patch of the input batch.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches, embed_dim)`.
        """
        embedded = self.proj(x)  # (n_samples, embed_dim, n_patches**0.5, n_patches**0.5)
        embedded = embedded.flatten(2)  # (n_samples, embed_dim, n_patches)
        return embedded.transpose(1, 2)  # (n_samples, n_patches, embed_dim)
class Attention(nn.Module):
    """Multi-head self-attention.

    Parameters
    ----------
    dim : int
        The input and output dimension of per-token features.
    n_heads : int
        Number of attention heads.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    attn_p : float
        Dropout probability applied to the attention weights.
    proj_p : float
        Dropout probability applied to the output tensor.

    Attributes
    ----------
    scale : float
        Normalizing constant for the dot product (1 / sqrt(head_dim)).
    qkv : nn.Linear
        Joint linear projection producing queries, keys and values.
    proj : nn.Linear
        Linear mapping applied to the concatenated output of all heads.
    attn_drop, proj_drop : nn.Dropout
        Dropout layers.
    """

    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        n_samples, n_tokens, dim = x.shape
        if dim != self.dim:
            raise ValueError

        # One projection yields q, k and v; split them out per head.
        qkv = self.qkv(x)  # (n_samples, n_tokens, 3 * dim)
        qkv = qkv.reshape(n_samples, n_tokens, 3, self.n_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)
        # each: (n_samples, n_heads, n_tokens, head_dim)

        # Scaled dot-product attention weights.
        scores = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))
        # (n_samples, n_heads, n_tokens, n_tokens)

        # Weighted average of the values, heads concatenated back together.
        context = weights @ v  # (n_samples, n_heads, n_tokens, head_dim)
        context = context.transpose(1, 2).flatten(2)  # (n_samples, n_tokens, dim)

        return self.proj_drop(self.proj(context))
""" n_samples, n_tokens, dim = x.shape if dim != self.dim: raise ValueError qkv = self.qkv(x) # (n_samples, n_patches + 1, 3 * dim) qkv = qkv.reshape( n_samples, n_tokens, 3, self.n_heads, self.head_dim ) # (n_smaples, n_patches + 1, 3, n_heads, head_dim) qkv = qkv.permute( 2, 0, 3, 1, 4 ) # (3, n_samples, n_heads, n_patches + 1, head_dim) q, k, v = qkv[0], qkv[1], qkv[2] k_t = k.transpose(-2, -1) # (n_samples, n_heads, head_dim, n_patches + 1) dp = ( q @ k_t ) * self.scale # (n_samples, n_heads, n_patches + 1, n_patches + 1) attn = dp.softmax(dim=-1) # (n_samples, n_heads, n_patches + 1, n_patches + 1) attn = self.attn_drop(attn) weighted_avg = attn @ v # (n_samples, n_heads, n_patches +1, head_dim) weighted_avg = weighted_avg.transpose( 1, 2 ) # (n_samples, n_patches + 1, n_heads, head_dim) weighted_avg = weighted_avg.flatten(2) # (n_samples, n_patches + 1, dim) x = self.proj(weighted_avg) # (n_samples, n_patches + 1, dim) x = self.proj_drop(x) # (n_samples, n_patches + 1, dim) return x class MLP(nn.Module): """Multilayer perceptron. Parameters ---------- in_features : int Number of input features. hidden_features : int Number of nodes in the hidden layer. out_features : int Number of output features. p : float Dropout probability. Attributes ---------- fc : nn.Linear The First linear layer. act : nn.GELU GELU activation function. fc2 : nn.Linear The second linear layer. drop : nn.Dropout Dropout layer. """ def __init__(self, in_features, hidden_features, out_features, p=0.): super().__init__() self.fc1 = nn.Linear(in_features, hidden_features) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(p) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_patches + 1, in_features)`. 
class Block(nn.Module):
    """Transformer encoder block (pre-norm: LayerNorm before each sublayer).

    Parameters
    ----------
    dim : int
        Embedding dimension.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Determines the hidden dimension size of the `MLP` module with respect
        to `dim`.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    p, attn_p : float
        Dropout probability.

    Attributes
    ----------
    norm1, norm2 : LayerNorm
        Layer normalization.
    attn : Attention
        Attention module.
    mlp : MLP
        MLP module.
    """

    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
            dim,
            n_heads=n_heads,
            qkv_bias=qkv_bias,
            attn_p=attn_p,
            proj_p=p,
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
            in_features=dim,
            hidden_features=hidden_features,
            out_features=dim,
            # BUG FIX: `p` is documented as the dropout probability of this
            # block, and is forwarded to Attention as `proj_p`, but was never
            # forwarded here — MLP dropout was silently always 0. Default
            # behavior (p=0.) is unchanged.
            p=p,
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        # Residual connections around the pre-normalized attention and MLP.
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))

        return x
class VisionTransformer(nn.Module):
    """Simplified implementation of the Vision transformer.

    Parameters
    ----------
    img_size : int
        Both height and the width of the image (it is a square).
    patch_size : int
        Both height and the width of the patch (it is a square).
    in_chans : int
        Number of input channels.
    n_classes : int
        Number of classes.
    embed_dim : int
        Dimensionality of the token/patch embeddings.
    depth : int
        Number of blocks.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Determines the hidden dimension of the `MLP` module.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    p, attn_p : float
        Dropout probability.

    Attributes
    ----------
    patch_embed : PatchEmbed
        Instance of `PatchEmbed` layer.
    cls_token : nn.Parameter
        Learnable classification token prepended to the patch sequence.
        It has `embed_dim` elements.
    pos_embed : nn.Parameter
        Positional embedding of the cls token + all the patches.
        It has `(n_patches + 1) * embed_dim` elements.
    pos_drop : nn.Dropout
        Dropout layer.
    blocks : nn.ModuleList
        List of `Block` modules.
    norm : nn.LayerNorm
        Layer normalization.
    head : nn.Linear
        Classification head mapping the final CLS embedding to logits.
    """

    def __init__(
        self,
        img_size=384,
        patch_size=16,
        in_chans=3,
        n_classes=1000,
        embed_dim=768,
        depth=12,
        n_heads=12,
        mlp_ratio=4.,
        qkv_bias=True,
        p=0.,
        attn_p=0.,
    ):
        super().__init__()

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)

        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p,
                )
                for _ in range(depth)
            ]
        )

        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        logits : torch.Tensor
            Logits over all the classes - `(n_samples, n_classes)`.
        """
        n_samples = x.shape[0]

        x = self.patch_embed(x)  # (n_samples, n_patches, embed_dim)

        # Prepend one learnable CLS token per sample.
        cls_token = self.cls_token.expand(n_samples, -1, -1)
        x = torch.cat((cls_token, x), dim=1)  # (n_samples, 1 + n_patches, embed_dim)

        x = self.pos_drop(x + self.pos_embed)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)

        # Classify from the CLS token only.
        return self.head(x[:, 0])
""" n_samples = x.shape[0] x = self.patch_embed(x) cls_token = self.cls_token.expand( n_samples, -1, -1 ) # (n_samples, 1, embed_dim) x = torch.cat((cls_token, x), dim=1) # (n_samples, 1 + n_patches, embed_dim) x = x + self.pos_embed # (n_samples, 1 + n_patches, embed_dim) x = self.pos_drop(x) for block in self.blocks: x = block(x) x = self.norm(x) cls_token_final = x[:, 0] # just the CLS token x = self.head(cls_token_final) return x ================================================ FILE: github_adventures/vision_transformer/forward.py ================================================ import numpy as np from PIL import Image import torch k = 10 imagenet_labels = dict(enumerate(open("classes.txt"))) model = torch.load("model.pth") model.eval() img = (np.array(Image.open("cat.png")) / 128) - 1 # in the range -1, 1 inp = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).to(torch.float32) logits = model(inp) probs = torch.nn.functional.softmax(logits, dim=-1) top_probs, top_ixs = probs[0].topk(k) for i, (ix_, prob_) in enumerate(zip(top_ixs, top_probs)): ix = ix_.item() prob = prob_.item() cls = imagenet_labels[ix].strip() print(f"{i}: {cls:<45} --- {prob:.4f}") ================================================ FILE: github_adventures/vision_transformer/verify.py ================================================ import numpy as np import timm import torch from custom import VisionTransformer # Helpers def get_n_params(module): return sum(p.numel() for p in module.parameters() if p.requires_grad) def assert_tensors_equal(t1, t2): a1, a2 = t1.detach().numpy(), t2.detach().numpy() np.testing.assert_allclose(a1, a2) model_name = "vit_base_patch16_384" model_official = timm.create_model(model_name, pretrained=True) model_official.eval() print(type(model_official)) custom_config = { "img_size": 384, "in_chans": 3, "patch_size": 16, "embed_dim": 768, "depth": 12, "n_heads": 12, "qkv_bias": True, "mlp_ratio": 4, } model_custom = VisionTransformer(**custom_config) 
model_custom.eval() for (n_o, p_o), (n_c, p_c) in zip( model_official.named_parameters(), model_custom.named_parameters() ): assert p_o.numel() == p_c.numel() print(f"{n_o} | {n_c}") p_c.data[:] = p_o.data assert_tensors_equal(p_c.data, p_o.data) inp = torch.rand(1, 3, 384, 384) res_c = model_custom(inp) res_o = model_official(inp) # Asserts assert get_n_params(model_custom) == get_n_params(model_official) assert_tensors_equal(res_c, res_o) # Save custom model torch.save(model_custom, "model.pth") ================================================ FILE: mini_tutorials/bentoml/README.md ================================================ 1. [Resources](#resources) 2. [Installation](#installation) 3. [Instructions](#instructions) 1. [`bentoml`](#bentoml) 1. [`bentoctl`](#bentoctl) 1. [`aws` CLI](#aws-cli) 4. [Sketches](#sketches) # Resources * https://docs.bentoml.com/en/latest/ * https://github.com/bentoml/bentoctl * https://github.com/bentoml/aws-sagemaker-deploy # Installation ```bash pip install -r requirements.txt ``` See below the actual versions at the time of making the video ```txt bentoctl==0.4.0 bentoml==1.1.9 boto3==1.29.0 numpy==1.26.2 pydantic==2.5.1 pydantic_core==2.14.3 scikit-learn==1.3.2 ``` # Instructions ## `bentoml` Creating a model ```bash python create_model.py ``` Listing all existing models ```bash bentoml models list ``` Build a bento ```bash bentoml build ``` List all existing bentos ```bash bentoml list ``` Serve a bento locally ```bash bentoml serve $BENTO ``` Serve a `service.py` (development) ```bash bentoml serve service.py ``` ## `bentoctl` Install SageMaker operator ```bash bentoctl operator install aws-sagemaker ``` Initialize ```bash bentoctl init ``` ATTENTION: All of the below assumes that you have correctly set up AWS secret keys and permissions. 
Build custom customized SageMaker image and push to ECR ```bash bentoctl build -f deployment_config.yaml -b $BENTO ``` Initialize terraform ```bash terraform init ``` Look at what changes will be applied ```bash terraform plan -var-file=bentoctl.tfvars ``` Actually apply changes ```bash terraform apply -var-file=bentoctl.tfvars ``` Send request to the API Gateway ```bash curl -X 'POST' "$URL/classify" -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ "sepal_width": 0, "sepal_length": 0, "petal_width": 0, "petal_length": 0 }' ``` Destroy resources (not including ECR) ```bash terraform destroy -var-file=bentoctl.tfvars ``` Destroy resources including ECR) ```bash bentoctl destroy ``` ## `aws` CLI Describe repositories ```bash aws ecr describe-repositories ``` List all images in the repository `amazing-iris` ```bash aws ecr list-images --repository-name=amazing-iris ``` List SageMaker models ```bash aws sagemaker list-models ``` List SageMaker endpoints ```bash aws sagemaker list-endpoints ``` # Sketches bentoml-overview sklearn-sagemaker ================================================ FILE: mini_tutorials/bentoml/bentofile.yaml ================================================ service: "service:svc" include: - "service.py" python: packages: - pydantic - scikit-learn models: - iris_clf:latest ================================================ FILE: mini_tutorials/bentoml/create_model.py ================================================ import bentoml from sklearn import datasets from sklearn import svm iris = datasets.load_iris() X, y = iris.data, iris.target clf = svm.SVC(gamma="scale") clf.fit(X, y) saved_model = bentoml.sklearn.save_model("iris_clf", clf) print(saved_model) ================================================ FILE: mini_tutorials/bentoml/requirements.txt ================================================ bentoctl bentoml boto3 numpy pydantic scikit-learn ================================================ FILE: 
mini_tutorials/bentoml/service.py ================================================ from typing import Literal import bentoml from pydantic import BaseModel from bentoml.io import JSON iris_clf_runner = bentoml.sklearn.get("iris_clf:latest").to_runner() svc = bentoml.Service("iris_classifier", runners=[iris_clf_runner]) class Request(BaseModel): sepal_width: float sepal_length: float petal_width: float petal_length: float class Response(BaseModel): label: Literal["setosa", "versicolor", "virginica"] @svc.api(input=JSON(pydantic_model=Request), output=JSON(pydantic_model=Response)) def classify(request: Request) -> Response: input_ = [ request.sepal_width, request.sepal_length, request.petal_width, request.petal_length, ] label_index = iris_clf_runner.predict.run([input_])[0] label = ["setosa", "versicolor", "virginica"][label_index] return Response(label=label) ================================================ FILE: mini_tutorials/custom_optimizer_in_pytorch/custom.py ================================================ import numpy as np import torch from torch.optim import Optimizer class WeirdDescent(Optimizer): """Take a coordinate descent step for a random parameter. And also, make every 100th step way bigger. 
""" def __init__(self, parameters, lr=1e-3): defaults = {"lr": lr} super().__init__(parameters, defaults) def step(self, closure=None): loss = None if closure is not None: loss = closure() if not self.state: self.state["step"] = 1 else: self.state["step"] += 1 c = 1 if self.state["step"] % 100 == 0: c = 100 grad = None while grad is None: param_group = np.random.choice(self.param_groups) tensor = np.random.choice(param_group["params"]) grad = tensor.grad.data element_ix = np.random.randint(tensor.numel()) mask_flat = torch.zeros(tensor.numel()) mask_flat[element_ix] = 1 mask = mask_flat.reshape(tensor.shape) tensor.data.add_(grad * mask, alpha=-param_group["lr"] * c) return loss ================================================ FILE: mini_tutorials/custom_optimizer_in_pytorch/src.py ================================================ from matplotlib.animation import FuncAnimation import matplotlib.pyplot as plt import numpy as np import torch from torch.optim import Adam, SGD from tqdm import tqdm from custom import WeirdDescent def rosenbrock(xy): """Evaluate Rosenbrock function. Parameters ---------- xy : tuple Two element tuple of floats representing the x resp. y coordinates. Returns ------- float The Rosenbrock function evaluated at the point `xy`. """ x, y = xy return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2 def run_optimization(xy_init, optimizer_class, n_iter, **optimizer_kwargs): """Run optimization finding the minimum of the Rosenbrock function. Parameters ---------- xy_init : tuple Two floats representing the x resp. y coordinates. optimizer_class : object Optimizer class. n_iter : int Number of iterations to run the optimization for. optimizer_kwargs : dict Additional parameters to be passed into the optimizer. Returns ------- path : np.ndarray 2D array of shape `(n_iter + 1, 2)`. Where the rows represent the iteration and the columns represent the x resp. y coordinates. 
""" xy_t = torch.tensor(xy_init, requires_grad=True) optimizer = optimizer_class([xy_t], **optimizer_kwargs) path = np.empty((n_iter + 1, 2)) path[0, :] = xy_init for i in tqdm(range(1, n_iter + 1)): optimizer.zero_grad() loss = rosenbrock(xy_t) loss.backward() torch.nn.utils.clip_grad_norm_(xy_t, 1.0) optimizer.step() path[i, :] = xy_t.detach().numpy() return path def create_animation(paths, colors, names, figsize=(12, 12), x_lim=(-2, 2), y_lim=(-1, 3), n_seconds=5): """Create an animation. Parameters ---------- paths : list List of arrays representing the paths (history of x,y coordinates) the optimizer went through. colors : list List of strings representing colors for each path. names : list List of strings representing names for each path. figsize : tuple Size of the figure. x_lim, y_lim : tuple Range of the x resp. y axis. n_seconds : int Number of seconds the animation should last. Returns ------- anim : FuncAnimation Animation of the paths of all the optimizers. """ if not (len(paths) == len(colors) == len(names)): raise ValueError path_length = max(len(path) for path in paths) n_points = 300 x = np.linspace(*x_lim, n_points) y = np.linspace(*y_lim, n_points) X, Y = np.meshgrid(x, y) Z = rosenbrock([X, Y]) minimum = (1.0, 1.0) fig, ax = plt.subplots(figsize=figsize) ax.contour(X, Y, Z, 90, cmap="jet") scatters = [ax.scatter(None, None, label=label, c=c) for c, label in zip(colors, names)] ax.legend(prop={"size": 25}) ax.plot(*minimum, "rD") def animate(i): for path, scatter in zip(paths, scatters): scatter.set_offsets(path[:i, :]) ax.set_title(str(i)) ms_per_frame = 1000 * n_seconds / path_length anim = FuncAnimation(fig, animate, frames=path_length, interval=ms_per_frame) return anim if __name__ == "__main__": xy_init = (.3, .8) n_iter = 1500 path_adam = run_optimization(xy_init, Adam, n_iter) path_sgd = run_optimization(xy_init, SGD, n_iter, lr=1e-3) path_weird = run_optimization(xy_init, WeirdDescent, n_iter, lr=1e-3) freq = 10 paths = 
[path_adam[::freq], path_sgd[::freq], path_weird[::freq]] colors = ["green", "blue", "black"] names = ["Adam", "SGD", "Weird"] anim = create_animation(paths, colors, names, figsize=(12, 7), x_lim=(-.1, 1.1), y_lim=(-.1, 1.1), n_seconds=7) anim.save("result.gif") print(path_weird[-15:]) ================================================ FILE: mini_tutorials/deploying_on_kubernetes/Dockerfile ================================================ FROM huggingface/transformers-pytorch-gpu RUN python3 -c "from transformers import AutoModel;AutoModel.from_pretrained('bert-base-uncased')" RUN python3 -c "from transformers import AutoTokenizer;AutoTokenizer.from_pretrained('bert-base-uncased')" RUN pip install fastapi uvicorn EXPOSE 8888 ENTRYPOINT ["transformers-cli", "serve", "--port=8888", "--host=0.0.0.0", "--task=fill-mask", "--model=bert-base-uncased"] ================================================ FILE: mini_tutorials/deploying_on_kubernetes/DockerfileConda ================================================ FROM continuumio/miniconda3 RUN conda install -c conda-forge pytorch-cpu RUN conda install -c conda-forge fastapi RUN conda install -c conda-forge uvicorn RUN conda install -c huggingface transformers RUN conda install -c conda-forge huggingface_hub=0.2.1 RUN python3 -c "from transformers import AutoModel;AutoModel.from_pretrained('bert-base-uncased')" RUN python3 -c "from transformers import AutoTokenizer;AutoTokenizer.from_pretrained('bert-base-uncased')" EXPOSE 8888 ENTRYPOINT ["transformers-cli", "serve", "--port=8888", "--host=0.0.0.0", "--task=fill-mask", "--model=bert-base-uncased"] ================================================ FILE: mini_tutorials/deploying_on_kubernetes/README.md ================================================ # Relevant commands ## Creating an API ```bash transformers-cli serve --task=fill-mask --model=bert-base-uncased ``` ```bash curl http://localhost:8888 | jq ``` ```bash curl -X POST http://localhost:8888/forward -H "accept: 
application/json" -H "Content-Type: application/json" -d '{"inputs": "Today is going to be a [MASK] day"}' | jq ``` ## Containerization Build first image. ```bash docker build -t cool-api:v1 . ``` Build second image. ```bash docker build -t cool-api:v2 -f DockerfileConda . ``` Run image. ```bash docker run -it --rm -P cool-api:v2 ``` ## Deploying on Kubernetes Start a minikube cluster. ```bash minikube start ``` Get all objects across all namespaces. ```bash kubectl get all -A ``` List images. ```bash minikube image list ``` Load an image. ```bash minikube image cool-api:v2 ``` Create a deployment. ```bash kubectl create deploy cool-deploy --image=cool-api:v2 ``` Create a service. ```bash kubectl expose deploy/cool-deploy --name=cool-service --target-port=8888 --port=1234 ``` Scale up. ```bash kubectl scale deploy/cool-deploy --replicas=3 ``` Get logs. ```bash kubectl logs -f PODFULLNAME ``` ================================================ FILE: mini_tutorials/embedding/README.md ================================================ # Training data The Dracula book can be found here: https://archive.org/stream/draculabr00stokuoft/draculabr00stokuoft_djvu.txt ================================================ FILE: mini_tutorials/embedding/Visualize.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "incredible-backup", "metadata": {}, "outputs": [], "source": [ "import ipywidgets\n", "import matplotlib.pyplot as plt\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "proud-accreditation", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"res.csv\")\n", "last_epoch = df[\"epoch\"].max()" ] }, { "cell_type": "code", "execution_count": null, "id": "canadian-nightlife", "metadata": {}, "outputs": [], "source": [ "@ipywidgets.interact\n", "def f(epoch=ipywidgets.IntSlider(min=0, max=last_epoch , continuous_update=False)):\n", " fig, ax = plt.subplots(1, 1, 
figsize=(12, 8))\n", " ax.set_xlim([-2, 2])\n", " ax.set_ylim([-2, 2])\n", " df_iter = df[df[\"epoch\"] == epoch]\n", " df_iter.plot(kind='scatter', x='dim_0',y='dim_1', ax=ax, c=\"red\")\n", " df_iter[['dim_0','dim_1','character']].apply(lambda row:\n", " ax.text(row[\"dim_0\"] + 0.02,\n", " row[\"dim_1\"] + 0.01,\n", " row[\"character\"],\n", " fontsize=18),\n", " axis=1)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "early-vinyl", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: mini_tutorials/embedding/src.py ================================================ from collections import Counter, defaultdict import numpy as np import pandas as pd import torch from torch.nn import Embedding, Linear, LSTM, Module import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler from tqdm import tqdm class CharacterDataset(Dataset): """Custom dataset. Parameters ---------- text : str Input text that will be used to create the entire database. window_size : int Number of characters to use as input features. vocab_size : int Number of characters in the vocabulary. Note that the last character is always reserved for a special "~" out-of-vocabulary character. Attributes ---------- ch2ix : defaultdict Mapping from the character to the position of that character in the vocabulary. Note that all characters that are not in the vocabulary will get mapped into the index `vocab_size - 1`. ix2ch : dict Mapping from the character position in the vocabulary to the actual character. 
vocabulary : list List of all characters. `len(vocabulary) == vocab_size`. """ def __init__(self, text, window_size=1, vocab_size=50): self.text = text.replace("\n", " ") self.window_size = window_size self.ch2ix = defaultdict(lambda: vocab_size - 1) most_common_ch2ix = { x[0]: i for i, x in enumerate(Counter(self.text).most_common()[: (vocab_size - 1)]) } self.ch2ix.update(most_common_ch2ix) self.ch2ix["~"] = vocab_size - 1 self.ix2ch = {v: k for k, v in self.ch2ix.items()} self.vocabulary = [self.ix2ch[i] for i in range(vocab_size)] def __len__(self): return len(self.text) - self.window_size def __getitem__(self, ix): X = torch.LongTensor( [self.ch2ix[c] for c in self.text[ix : ix + self.window_size]] ) y = self.ch2ix[self.text[ix + self.window_size]] return X, y class Network(Module): """Custom network predicting the next character of a string. Parameters ---------- vocab_size : int The number of characters in the vocabulary. embedding_dim : int Dimension of the character embedding vectors. dense_dim : int Number of neurons in the linear layer that follows the LSTM. hidden_dim : int Size of the LSTM hidden state. max_norm : int If any of the embedding vectors has a higher L2 norm than `max_norm` it is rescaled. n_layers : int Number of the layers of the LSTM. """ def __init__( self, vocab_size, embedding_dim=2, dense_dim=32, hidden_dim=8, max_norm=2, n_layers=1, ): super().__init__() self.embedding = Embedding( vocab_size, embedding_dim, padding_idx=vocab_size - 1, norm_type=2, max_norm=max_norm, ) self.lstm = LSTM( embedding_dim, hidden_dim, batch_first=True, num_layers=n_layers ) self.linear_1 = Linear(hidden_dim, dense_dim) self.linear_2 = Linear(dense_dim, vocab_size) def forward(self, x, h=None, c=None): """Run the forward pass. Parameters ---------- x : torch.Tensor Input tensor of shape `(n_samples, window_size)` of dtype `torch.int64`. h, c : torch.Tensor or None Hidden states of the LSTM. 
        Returns
        -------
        logits : torch.Tensor
            Tensor of shape `(n_samples, vocab_size)`.

        h, c : torch.Tensor or None
            Hidden states of the LSTM.
        """
        emb = self.embedding(x)  # (n_samples, window_size, embedding_dim)
        if h is not None and c is not None:
            # Continue from the hidden state the caller carried over.
            _, (h, c) = self.lstm(emb, (h, c))
        else:
            _, (h, c) = self.lstm(emb)  # (n_layers, n_samples, hidden_dim)

        # Average the hidden state over the LSTM layers.
        h_mean = h.mean(dim=0)  # (n_samples, hidden_dim)
        x = self.linear_1(h_mean)  # (n_samples, dense_dim)
        logits = self.linear_2(x)  # (n_samples, vocab_size)

        return logits, h, c


def compute_loss(cal, net, dataloader):
    """Compute the average loss over a dataset.

    Parameters
    ----------
    cal : callable
        Loss function taking `(logits, targets)` and returning a scalar
        tensor (e.g. `torch.nn.CrossEntropyLoss()`).

    net : Module
        Network returning `(logits, h, c)`; it is switched to eval mode.

    dataloader : torch.utils.data.DataLoader
        Yields `(X_batch, y_batch)` pairs.

    Returns
    -------
    float
        Mean of the per-batch losses.
    """
    net.eval()
    all_losses = []
    for X_batch, y_batch in dataloader:
        # NOTE: despite the name, `probs` are raw logits — `cal` is expected
        # to accept logits (as CrossEntropyLoss does).
        probs, _, _ = net(X_batch)
        all_losses.append(cal(probs, y_batch).item())

    return np.mean(all_losses)


def generate_text(n_chars, net, dataset, initial_text="Hello", random_state=None):
    """Generate text with the character-level model.

    Parameters
    ----------
    n_chars : int
        Number of characters to generate.

    net : Module
        Character-level model.

    dataset : CharacterDataset
        Instance of the `CharacterDataset`.

    initial_text : str
        The starting text to be used as the initial condition for the model.

    random_state : None or int
        If not None, then the result is reproducible.

    Returns
    -------
    res : str
        Generated text.
""" if not initial_text: raise ValueError("You need to specify the initial text") res = initial_text net.eval() h, c = None, None if random_state is not None: np.random.seed(random_state) for _ in range(n_chars): previous_chars = initial_text if res == initial_text else res[-1] features = torch.LongTensor([[dataset.ch2ix[c] for c in previous_chars]]) logits, h, c = net(features, h, c) probs = F.softmax(logits[0], dim=0).detach().numpy() new_ch = np.random.choice(dataset.vocabulary, p=probs) res += new_ch return res if __name__ == "__main__": with open("text.txt", "r") as f: text = "\n".join(f.readlines()) # Hyperparameters model vocab_size = 70 window_size = 10 embedding_dim = 2 hidden_dim = 16 dense_dim = 32 n_layers = 1 max_norm = 2 # Training config n_epochs = 25 train_val_split = 0.8 batch_size = 128 random_state = 13 torch.manual_seed(random_state) loss_f = torch.nn.CrossEntropyLoss() dataset = CharacterDataset(text, window_size=window_size, vocab_size=vocab_size) n_samples = len(dataset) split_ix = int(n_samples * train_val_split) train_indices, val_indices = np.arange(split_ix), np.arange(split_ix, n_samples) train_dataloader = DataLoader( dataset, sampler=SubsetRandomSampler(train_indices), batch_size=batch_size ) val_dataloader = DataLoader( dataset, sampler=SubsetRandomSampler(val_indices), batch_size=batch_size ) net = Network( vocab_size, hidden_dim=hidden_dim, n_layers=n_layers, dense_dim=dense_dim, embedding_dim=embedding_dim, max_norm=max_norm, ) optimizer = torch.optim.Adam( net.parameters(), lr=1e-2, ) emb_history = [] for e in range(n_epochs + 1): net.train() for X_batch, y_batch in tqdm(train_dataloader): if e == 0: break optimizer.zero_grad() probs, _, _ = net(X_batch) loss = loss_f(probs, y_batch) loss.backward() optimizer.step() train_loss = compute_loss(loss_f, net, train_dataloader) val_loss = compute_loss(loss_f, net, val_dataloader) print(f"Epoch: {e}, {train_loss=:.3f}, {val_loss=:.3f}") # Generate one sentence initial_text = "I hope it 
works " generated_text = generate_text( 100, net, dataset, initial_text=initial_text, random_state=random_state ) print(generated_text) # Prepare DataFrame weights = net.embedding.weight.detach().clone().numpy() df = pd.DataFrame(weights, columns=[f"dim_{i}" for i in range(embedding_dim)]) df["epoch"] = e df["character"] = dataset.vocabulary emb_history.append(df) final_df = pd.concat(emb_history) final_df.to_csv("res.csv", index=False) ================================================ FILE: mini_tutorials/fewshot_text_classification/classify.py ================================================ import pathlib import jinja2 import openai path = pathlib.Path("template.jinja2") with path.open() as f: prompt_template = jinja2.Template(f.read()) labels = [ {"label": 0, "description": "negative sentiment"}, {"label": 1, "description": "neutral sentiment"}, {"label": 2, "description": "positive sentiment"}, ] examples = [ {"text": "Today was a horrible day", "label": 0}, {"text": "Yesterday was a great day", "label": 2}, ] text = "I loved the TV show" prompt = prompt_template.render( examples=examples, labels=labels, text=text, ) print(prompt) completion = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt} ] ) print(completion.choices[0].message) ================================================ FILE: mini_tutorials/fewshot_text_classification/template.jinja2 ================================================ I want you to classify text for me. 
See below all the possible labels and their description {% for item in labels %} """ description: {{ item.description }} label: {{ item.label }} """ {% endfor %} {% if examples %} See below a couple of examples {% for item in examples %} """ text: {{ item.text }} label: {{ item.label }} """ {% endfor %} {% endif %} Here is the text that needs to be classified """ text: {{ text }} label: ================================================ FILE: mini_tutorials/gradient_wrt_input/explain.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch import torchvision.models as models from utils import compute_gradient, read_image, scale_grad, to_array def func(inp, net=None, target=None): """Get logit of a target class. Parameters ---------- inp : torch.Tensor Input image (single image batch). net : torch.nn.Module Classifier network. target : int Imagenet ground truth label id. Returns ------- logit : torch.Tensor Logit of the `target` class. """ out = net(inp) logit = out[0, target] return logit def compute_integrated_gradients(inp, baseline, net, target, n_steps=100): """Compute integrated gradients. Parameters ---------- inp : torch.Tensor Input image (single image batch) of shape `(1, 3, *, *)`. baseline : torch.Tensor Basline image of the same shape as the `inp`. net : torch.nn.Module Classifier network. target : int Imagenet ground truth label id. n_steps : int Number of steps between the `inp` and `baseline` tensors. Returns ------- ig : torch.Tensor Integrated gradients with the same shape as the `inp`. inp_grad : torch.Tensor Gradient with respect to the `inp` tensor. Same shape as `inp`. 
""" path = [baseline + a * (inp - baseline) for a in np.linspace(0, 1, n_steps)] grads = [compute_gradient(func, x, net=net, target=target) for x in path] ig = (inp - baseline) * torch.cat(grads[:-1]).mean(dim=0, keepdims=True) return ig, grads[-1] if __name__ == "__main__": net = models.resnet18(pretrained=True) net.eval() tensor = read_image("img.jpg") arr = to_array(tensor) n_steps = 100 baseline = -1.5 * torch.ones_like(tensor) ig, inp_grad = compute_integrated_gradients( tensor, baseline, net, 291, n_steps=n_steps ) ig_scaled = scale_grad(ig) inp_grad_scaled = scale_grad(inp_grad) _, (ax_baseline, ax_img, ax_inp_grad, ax_ig) = plt.subplots(1, 4, figsize=(19.20,10.80)) ax_baseline.imshow(to_array(baseline)) ax_img.imshow(arr) ax_inp_grad.imshow(arr * inp_grad_scaled) ax_ig.imshow(arr * ig_scaled) ax_baseline.set_title("Baseline") ax_img.set_title("Input") ax_inp_grad.set_title("Gradient input") ax_ig.set_title("Integrated gradients") ax_baseline.axis("off") ax_img.axis("off") ax_inp_grad.axis("off") ax_ig.axis("off") plt.savefig("res_2.png") ================================================ FILE: mini_tutorials/gradient_wrt_input/fool.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch import torchvision.models as models from utils import compute_gradient, read_image, to_array def func(inp, net=None, target=None): """Compute negative log likelihood. Parameters ---------- inp : torch.Tensor Input image (single image batch). net : torch.nn.Module Classifier network. target : int Imagenet ground truth label id. Returns ------- loss : torch.Tensor Loss for the `inp` image. """ out = net(inp) loss = torch.nn.functional.nll_loss(out, target=torch.LongTensor([target])) print(f"Loss: {loss.item()}") return loss def attack(tensor, net, eps=1e-3, n_iter=50): """Run the Fast Sign Gradient Method (FSGM) attack. Parameters ---------- tensor : torch.Tensor The input image of shape `(1, 3, 224, 224)`. 
net : torch.nn.Module Classifier network. eps : float Determines how much we modify the image in a single iteration. n_iter : int Number of iterations. Returns ------- new_tensor : torch.Tensor New image that is a modification of the input image that "fools" the classifier. """ new_tensor = tensor.detach().clone() orig_prediction = net(tensor).argmax() print(f"Original prediction: {orig_prediction.item()}") for i in range(n_iter): net.zero_grad() grad = compute_gradient( func, new_tensor, net=net, target=orig_prediction.item() ) new_tensor = torch.clamp(new_tensor + eps * grad.sign(), -2, 2) new_prediction = net(new_tensor).argmax() if orig_prediction != new_prediction: print(f"We fooled the network after {i} iterations!") print(f"New prediction: {new_prediction.item()}") break return new_tensor, orig_prediction.item(), new_prediction.item() if __name__ == "__main__": net = models.resnet18(pretrained=True) net.eval() tensor = read_image("img.jpg") new_tensor, orig_prediction, new_prediction = attack( tensor, net, eps=1e-3, n_iter=100 ) _, (ax_orig, ax_new, ax_diff) = plt.subplots(1, 3, figsize=(19.20,10.80)) arr = to_array(tensor) new_arr = to_array(new_tensor) diff_arr = np.abs(arr - new_arr).mean(axis=-1) diff_arr = diff_arr / diff_arr.max() ax_orig.imshow(arr) ax_new.imshow(new_arr) ax_diff.imshow(diff_arr, cmap="gray") ax_orig.axis("off") ax_new.axis("off") ax_diff.axis("off") ax_orig.set_title(f"Original: {orig_prediction}") ax_new.set_title(f"Modified: {new_prediction}") ax_diff.set_title("Difference") plt.savefig("res_1.png") ================================================ FILE: mini_tutorials/gradient_wrt_input/utils.py ================================================ from PIL import Image import torch from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, ToTensor) def compute_gradient(func, inp, **kwargs): """Compute the gradient with respect to `inp`. 
    Parameters
    ----------
    func : callable
        Function that takes in `inp` and `kwargs` and returns a single
        element tensor.

    inp : torch.Tensor
        The tensor that we want to get the gradients for. Needs to be a leaf
        node.

    **kwargs : dict
        Additional keyword arguments passed into `func`.

    Returns
    -------
    grad : torch.Tensor
        Tensor of the same shape as `inp` that is representing the gradient.
    """
    inp.requires_grad = True

    loss = func(inp, **kwargs)
    loss.backward()
    # Turn gradient tracking back off so callers get a plain tensor again.
    inp.requires_grad = False

    # NOTE(review): `.grad` is not zeroed here, so calling this twice on the
    # *same* leaf tensor would accumulate gradients — current callers always
    # pass a fresh tensor; confirm before reusing.
    return inp.grad.data


def read_image(path):
    """Load image from disk and convert to torch.Tensor.

    Parameters
    ----------
    path : str
        Path to the image.

    Returns
    -------
    tensor : torch.Tensor
        Single sample batch containing our image (ready to be used with
        pretrained networks). The shape is `(1, 3, 224, 224)`.
    """
    img = Image.open(path)
    # Standard ImageNet preprocessing: resize, center-crop, normalize.
    transform = Compose([Resize(256),
                         CenterCrop(224),
                         ToTensor(),
                         Normalize(mean=[0.485, 0.456, 0.406],
                                   std=[0.229, 0.224, 0.225])])
    tensor_ = transform(img)
    tensor = tensor_.unsqueeze(0)  # add the batch dimension

    return tensor


def to_array(tensor):
    """Convert torch.Tensor to np.ndarray.

    Parameters
    ----------
    tensor : torch.Tensor
        Tensor of shape `(1, 3, *, *)` representing one sample batch of
        images.

    Returns
    -------
    arr : np.ndarray
        Array of shape `(*, *, 3)` representing an image that can be plotted
        directly.
    """
    tensor_ = tensor.squeeze()
    # Invert the ImageNet normalization: first divide out the std, then add
    # the mean back.
    unnormalize_transform = Compose([Normalize(mean=[0, 0, 0],
                                               std=[1 / 0.229, 1 / 0.224, 1 / 0.225]),
                                     Normalize(mean=[-0.485, -0.456, -0.406],
                                               std=[1, 1, 1])])
    arr_ = unnormalize_transform(tensor_)
    arr = arr_.permute(1, 2, 0).detach().numpy()  # channels-last for plotting

    return arr


def scale_grad(grad):
    """Scale gradient tensor.

    Parameters
    ----------
    grad : torch.Tensor
        Gradient of shape `(1, 3, *, *)`.

    Returns
    -------
    grad_arr : np.ndarray
        Array of shape `(*, *, 1)`.
""" grad_arr = torch.abs(grad).mean(dim=1).detach().permute(1, 2, 0) grad_arr /= grad_arr.quantile(0.98) grad_arr = torch.clamp(grad_arr, 0, 1) return grad_arr.numpy() ================================================ FILE: mini_tutorials/haiku_basics/buffers_in_torch.py ================================================ import torch bn = torch.nn.BatchNorm1d(5) bn.state_dict() for name, p in bn.named_buffers(): print(name, p, p.requires_grad) for name, p in bn.named_parameters(): print(name, p, p.requires_grad) ================================================ FILE: mini_tutorials/haiku_basics/parameter.py ================================================ from __future__ import annotations import haiku as hk import jax import jax.numpy as jnp def foo(x: jnp.ndarray) -> jnp.ndarray: c = hk.get_parameter("c", x.shape, init=hk.initializers.RandomNormal(1)) res = c + x key = hk.next_rng_key() mask = jax.random.bernoulli(key, 0.5, x.shape) return res * mask * 2 foo_transformed = hk.transform(foo) init_key = jax.random.PRNGKey(24) apply_key_seq = hk.PRNGSequence(init_key) x = jnp.ones((2, 5)) params = foo_transformed.init(init_key, x) for _ in range(2): res = foo_transformed.apply(params, next(apply_key_seq), x) print(res) ================================================ FILE: mini_tutorials/haiku_basics/reallife.py ================================================ from __future__ import annotations import haiku as hk import jax import jax.numpy as jnp def foo(x: jnp.ndarray) -> jnp.ndarray: mlp = hk.nets.MLP([4, 5, 1]) loss = mlp(x).mean() return loss foo_transformed = hk.without_apply_rng(hk.transform(foo)) init_key = jax.random.PRNGKey(3452) x = jnp.ones((2, 3)) params = foo_transformed.init(init_key, x) grad_foo = jax.jit(jax.grad(foo_transformed.apply)) grads = grad_foo(params, x) ================================================ FILE: mini_tutorials/haiku_basics/requirements.txt ================================================ -e 
git+ssh://git@github.com/deepmind/dm-haiku.git@386efc098fd52a5cf728e7d13442138ab25eb235#egg=dm_haiku jax==0.3.5 jaxlib==0.3.5 ================================================ FILE: mini_tutorials/haiku_basics/state.py ================================================ from __future__ import annotations import haiku as hk import jax import jax.numpy as jnp def foo(x: jnp.ndarray) -> jnp.ndarray: c = hk.get_parameter("c", x.shape, init=hk.initializers.RandomNormal(1)) counter = hk.get_state( "counter", shape=[], dtype=jnp.int32, init=jnp.ones ) hk.set_state("counter", counter + 1) res = c + x + counter return res foo_transformed = hk.transform_with_state(foo) init_key = jax.random.PRNGKey(32) x = jnp.ones((2, 5)) params, state = foo_transformed.init(init_key, x) for i in range(2): print(f"After {i} iterations") res, state = foo_transformed.apply(params, state, None, x) print(state) print(res) ================================================ FILE: mini_tutorials/httpx_rate_limiting/script.py ================================================ import asyncio import logging import httpx logger = logging.getLogger() logging.getLogger("httpx").setLevel(logging.WARNING) logging.basicConfig(format="%(asctime)s %(name)s %(message)s", level=logging.INFO) async def send_request(client: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> int: url = "https://pokeapi.co/api/v2/pokemon/ditto" async with semaphore: logger.info("Sending request") response = await client.get(url) logger.info("Response received") return response.status_code async def main() -> int: semaphore = asyncio.Semaphore(5) async with httpx.AsyncClient() as client: tasks = [asyncio.create_task(send_request(client, semaphore)) for _ in range(10)] status_codes = await asyncio.gather(*tasks) logger.info("All work done") return 0 if all(c == 200 for c in status_codes) else 1 if __name__ == "__main__": raise SystemExit(asyncio.run(main())) ================================================ FILE: 
def get_top_k(sequence, tokenizer, model, k=10):
    """Get the top k most probable tokens to fill the gap with.

    Parameters
    ----------
    sequence : str
        String containing the [MASK] token.
    tokenizer : BertFastTokenizer
        Tokenizer.
    model : BertForMaskedLM
        Model.
    k : int
        Number of the top results to return.

    Returns
    -------
    top_vocab_indices : torch.Tensor
        1D tensor representing the indices of the top tokens.
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    # Locate the single [MASK] position inside the tokenized sequence.
    mask_position = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]
    predictions = model(**encoded).logits
    _, top_vocab_indices = torch.topk(predictions[0, mask_position.item(), :], k)
    return top_vocab_indices
").strip() if guess == winner: print("You won!!!") else: print("You lost!!!") print("\nTrue ranking") for i, x in enumerate(top_tokens): print(i, x) ================================================ FILE: mini_tutorials/mocking_neural_networks/test.py ================================================ from unittest.mock import Mock import pytest import torch from transformers import (AutoTokenizer, AutoModelForMaskedLM, BatchEncoding, BertForMaskedLM, BertTokenizerFast) from app import get_top_k @pytest.mark.parametrize("k", [5, 7]) def test_with_real_objects(k): tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") sequence = "Hello [MASK]" res = get_top_k(sequence, tokenizer, model, k) assert isinstance(res, torch.Tensor) assert res.shape == (k,) @pytest.mark.parametrize("k", [5, 7]) def test_with_mock_objects(k): sequence = "Hello [MASK]" vocab_size = 1000 data = {"input_ids": torch.tensor([[101, 555, 103, 102]])} be = BatchEncoding(data=data) logits = torch.rand(1, 4, vocab_size) tokenizer_m = Mock(spec=BertTokenizerFast, return_value=be, mask_token_id=103) model_m = Mock(spec=BertForMaskedLM) model_m.return_value.logits = logits res = get_top_k(sequence, tokenizer_m, model_m, k=k) assert isinstance(res, torch.Tensor) assert res.shape == (k,) ================================================ FILE: mini_tutorials/numpy_equality_testing/test.py ================================================ import numpy as np import pytest def get_arrays(): """Create 4 arrays that are all similar but different. Returns ------- a : np.ndarray Reference array. a_eps : np.ndarray Same shape as `a`, however, the values are slightly different. a_dim : np.ndarray One extra dimension compared to `a`, however, the values are the same. a_nan : np.ndarray Same shape and same values, however, one entry is set to `np.nan`. 
""" eps = 1e-5 a = np.array([[1.2, 5.12, 2.4], [5.5, 8.8, 1.55]]) a_eps = a + eps a_dim = a[None, :] # shape (1, 2, 3) a_nan = a.copy() a_nan[0, 1] = np.nan return a, a_eps, a_dim, a_nan def test___eq__(): a, *_ = get_arrays() with pytest.raises(ValueError): assert a == a def test___eq__all(): a, a_eps, a_dim, a_nan = get_arrays() assert (a == a).all() assert not (a == a_eps).all() assert (a == a_dim).all() assert not (a_nan == a_nan).all() def test_array_equal(): a, a_eps, a_dim, a_nan = get_arrays() assert np.array_equal(a, a) assert not np.array_equal(a, a_eps) assert not np.array_equal(a, a_dim) assert not np.array_equal(a_nan, a_nan) assert np.array_equal(a_nan, a_nan, equal_nan=True) def test_allclose(): a, a_eps, a_dim, a_nan = get_arrays() atol = 1e-5 assert np.allclose(a, a, atol=atol) assert np.allclose(a, a_eps, atol=atol) assert np.allclose(a, a_dim, atol=atol) assert not np.allclose(a_nan, a_nan, atol=atol) assert np.allclose(a_nan, a_nan, atol=atol, equal_nan=True) def test_testing_array_equal(): a, a_eps, a_dim, a_nan = get_arrays() np.testing.assert_array_equal(a, a) # np.testing.assert_array_equal(a, a_eps) # np.testing.assert_array_equal(a, a_dim) np.testing.assert_array_equal(a_nan, a_nan) def test_testing_allclose(): a, a_eps, a_dim, a_nan = get_arrays() atol = 1e-5 np.testing.assert_allclose(a, a, atol=atol) np.testing.assert_allclose(a, a_eps, atol=atol) # np.testing.assert_allclose(a, a_dim, atol=atol) np.testing.assert_allclose(a_nan, a_nan, atol=atol) # np.testing.assert_allclose(a_nan, a_nan, atol=atol, equal_nan=False) ================================================ FILE: mini_tutorials/openai_function_calling/example.py ================================================ import json import logging import operator import sys import datetime import openai import yfinance as yf TODAY = datetime.date.today().strftime("%Y/%m/%d") logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s") logger = logging.getLogger(__name__) 
def get_price(symbol: str, date: str) -> float:
    """Fetch the closing price of ``symbol`` on ``date`` via yfinance."""
    logger.info(f"Calling get_price with {symbol=} and {date=}")
    frame = yf.download(
        symbol, start=date, period="1d", interval="1d", progress=False
    )
    return frame["Close"].iloc[0].item()


def calculate(a: float, b: float, op: str) -> float:
    """Apply the binary function ``operator.<op>`` to ``a`` and ``b``."""
    logger.info(f"Calling calculate with {a=}, {b=} and {op=}")
    binary_op = getattr(operator, op)
    return binary_op(a, b)
Note that the " "format of the date is YYYY/MM/DD", }, ] while True: response = openai.ChatCompletion.create( model="gpt-3.5-turbo-0613", temperature=0, messages=messages, functions=[get_price_metadata, calculate_metadata], ) message = response["choices"][0]["message"] messages.append(message) if "function_call" not in message: break # call custom functions function_name = message["function_call"]["name"] kwargs = json.loads(message["function_call"]["arguments"]) if function_name == "get_price": output = str(get_price(**kwargs)) elif function_name == "calculate": output = str(calculate(**kwargs)) else: raise ValueError messages.append({"role": "function", "name": function_name, "content": output}) print("*" * 80) print([m["role"] for m in messages]) print("*" * 80) print(messages[-1]["content"]) ================================================ FILE: mini_tutorials/rag_with_reranking/README.md ================================================ # Description ## Installation Run the following command to deploy a simple OpenSearch DB locally. ```bash docker run -p 9200:9200 -p 9600:9600 -e "DISABLE_SECURITY_PLUGIN=true" -e "discovery.type=single-node" --name opensearch-node -d opensearchproject/opensearch:latest ``` The version of the image was `2.10.0` at the time of making the video. To install the Python dependencies run ```bash pip install opensearch-py cohere ``` Again, I did not hardcode any version, but the versions at the time of making the video were ```bash cohere==4.27 opensearch-py==2.3.1 ``` ## Contents * `answer.py` - scripts that does RAG question answering - requires question as the only argument * `input.txt` - each line corresponds to a document to be added to OpenSearch(except for emtpy lines and comments) * `upload_data.py` - load `input.txt` into OpenSearch Note that to use the `answer.py` you need to get a Cohere API token and then export ```bash export COHERE_API_KEY=VERYSECRET python answer.py 'What is the meaning of life?' 
# Helper
def generate_prompt(question: str, contexts: "list[str]") -> str:
    """Build the RAG prompt from a question and its retrieved contexts.

    Parameters
    ----------
    question : str
        The user question to answer.
    contexts : list of str
        Retrieved document snippets, ordered by relevance. (The original
        annotation said ``str``, but the function iterates it as a list.)

    Returns
    -------
    str
        Prompt instructing the model to answer with "SOURCES" references.
    """
    prompt = (
        "Given the following extracted parts of a long document and a "
        'question, create a final answer with references ("SOURCES").'
        "If you don't know the answer, just say that you don't know, don't try "
        'to make up an answer. ALWAYS return a "SOURCES" part in your answer.\n'
    )
    prompt += f"QUESTION: {question}\n"
    prompt += "".join(
        f"SOURCE {i}: {context}\n" for i, context in enumerate(contexts)
    )
    prompt += "ANSWER: "
    return prompt
response = cohere_client.chat( chat_history=[], message=prompt ) print("Answer: ", response.text) ================================================ FILE: mini_tutorials/rag_with_reranking/input.txt ================================================ # AGE AND FAVOURITE FOOD - 'What is the favourite food of Charles?', 'Who prefers vegetables the most?' Adam is older than Ben Ben is older then Charles Adam eats a lot of carrots Ben's favourite food is an apple Charles loves KFC Whatever, this sentence does not really contain anything super important # SPORTING EVENTS - 'What country managed to become world football champion after 2050'? Brazil won the Fifa World Cup in 2070 France is pretty good at football and won many championships Finland has won many ice hockey world cups Jamaica won the Athletics World Cup in 2055 Mexico won the Golf World Cup in 2050 ================================================ FILE: mini_tutorials/rag_with_reranking/postman_collection.json ================================================ { "info": { "name": "Retrieval augmented generation", "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" }, "item": [ { "name": "OpenSearch", "item": [ { "name": "Get all indices", "request": { "method": "GET", "header": [], "url": { "raw": "{{OpenSearchURL}}/_cat/indices?v=true&s=index", "host": [ "{{OpenSearchURL}}" ], "path": [ "_cat", "indices" ], "query": [ { "key": "v", "value": "true" }, { "key": "s", "value": "index" } ] } }, "response": [] }, { "name": "Create index", "request": { "method": "PUT", "header": [], "body": { "mode": "raw", "raw": "{\n \"settings\": {\n \"index\": {\n \"number_of_shards\": 1,\n \"number_of_replicas\": 1\n }\n },\n \"mappings\": {\n \"properties\": {\n \"stuff\": {\n \"type\": \"text\"\n }\n }\n }\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index" ] } }, "response": [] }, { "name": "Delete 
index", "request": { "method": "DELETE", "header": [], "body": { "mode": "raw", "raw": "", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index" ] } }, "response": [] }, { "name": "Add document", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"stuff\": \"This is just some document\"\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index/_doc", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index", "_doc" ] } }, "response": [] }, { "name": "List all documents", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"query\": {\n \"match_all\": {}\n }\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index/_search", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index", "_search" ] } }, "response": [] }, { "name": "Lexical (BM 25) search", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"query\": {\n \"match\": {\n \"stuff\": \"Some document\"\n }\n }\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index/_search", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index", "_search" ] } }, "response": [] } ] }, { "name": "Cohere", "item": [ { "name": "Embed", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"texts\": [\n \"hello\",\n \"goodbye\"\n ],\n \"truncate\": \"END\"\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{CohereURL}}/embed", "host": [ "{{CohereURL}}" ], "path": [ "embed" ] }, "description": "[https://docs.cohere.com/reference/embed](https://docs.cohere.com/reference/embed)" }, "response": [] }, { "name": "Rerank", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"return_documents\": false,\n \"max_chunks_per_doc\": 10,\n \"query\": 
\"What is the capital of the United States?\",\n \"documents\": [\n \"Carson City is the capital city of the American state of Nevada.\",\n \"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.\",\n \"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.\",\n \"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.\"\n ]\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{CohereURL}}/rerank", "host": [ "{{CohereURL}}" ], "path": [ "rerank" ] }, "description": "[https://docs.cohere.com/reference/embed](https://docs.cohere.com/reference/embed)" }, "response": [] }, { "name": "Chat", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": " {\n \"chat_history\": [\n {\"role\": \"USER\", \"message\": \"Who discovered gravity?\"},\n {\"role\": \"CHATBOT\", \"message\": \"The man who is widely credited with discovering gravity is Sir Isaac Newton\"}\n ],\n \"message\": \"What year was he born?\"\n }", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{CohereURL}}/chat", "host": [ "{{CohereURL}}" ], "path": [ "chat" ] }, "description": "" }, "response": [] } ], "auth": { "type": "bearer", "bearer": [ { "key": "token", "value": "{{CohereAPIKey}}", "type": "string" } ] }, "event": [ { "listen": "prerequest", "script": { "type": "text/javascript", "exec": [ "" ] } }, { "listen": "test", "script": { "type": "text/javascript", "exec": [ "" ] } } ] } ] } ================================================ FILE: mini_tutorials/rag_with_reranking/upload_data.py ================================================ from pathlib import Path from opensearchpy import OpenSearch INPUT_FILE = "input.txt" INDEX_NAME = "cool_index" FIELD_NAME 
= "stuff" client = OpenSearch( hosts=[ { "host": "localhost", "port": 9200, } ] ) print(client.ping()) with Path(INPUT_FILE).open() as f: i = 0 for line in f.read().splitlines(): if not line or line.startswith("#"): continue print(f"Adding {i}") client.index(index=INDEX_NAME, body={FIELD_NAME: line}) i += 1 ================================================ FILE: mini_tutorials/visualizing_activations_with_forward_hooks/src.py ================================================ import pathlib import torch import torch.nn.functional as F from torch.nn import Linear, Module from torch.utils.tensorboard import SummaryWriter class Network(Module): def __init__(self): super().__init__() self.fc_1 = Linear(10, 20) self.fc_2 = Linear(20, 30) self.fc_3 = Linear(30, 2) def forward(self, x): x = self.fc_1(x) x = self.fc_2(x) x = self.fc_3(x) x = F.relu(x) return x if __name__ == "__main__": log_dir = pathlib.Path.cwd() / "tensorboard_logs" writer = SummaryWriter(log_dir) x = torch.rand(1, 10) net = Network() def activation_hook(inst, inp, out): """Run activation hook. Parameters ---------- inst : torch.nn.Module The layer we want to attach the hook to. inp : tuple of torch.Tensor The input to the `forward` method. out : torch.Tensor The output of the `forward` method. """ print("Here") writer.add_histogram(repr(inst), out) handle_1 = net.fc_1.register_forward_hook(activation_hook) net.fc_2.register_forward_hook(activation_hook) net.fc_3.register_forward_hook(activation_hook) y = net(x) handle_1.remove() y = net(x)