Repository: jankrepl/mildlyoverfitted
Branch: master
Commit: 22f0ecc67cef
Files: 118
Total size: 314.6 KB
Directory structure:
gitextract_ixgqmhua/
├── .gitignore
├── LICENSE
├── README.md
├── github_adventures/
│ ├── automata/
│ │ ├── model.py
│ │ └── train.py
│ ├── diffaugment/
│ │ ├── README.MD
│ │ ├── script.py
│ │ └── utils.py
│ ├── dino/
│ │ ├── data/
│ │ │ ├── README.md
│ │ │ └── imagenette_labels.json
│ │ ├── evaluation.py
│ │ ├── train.py
│ │ ├── utils.py
│ │ ├── visualize_attentions.ipynb
│ │ └── visualize_augmentations.ipynb
│ ├── gpt/
│ │ ├── README.md
│ │ ├── copy_and_generate.py
│ │ ├── distribution_visualizations.ipynb
│ │ ├── ipython_code.py
│ │ ├── model.py
│ │ ├── requirements.txt
│ │ └── utils.py
│ ├── integer/
│ │ ├── README.md
│ │ ├── bert.py
│ │ ├── experiments.sh
│ │ ├── fetch_data.py
│ │ ├── glove.py
│ │ ├── lstm.py
│ │ ├── requirements.txt
│ │ └── utils.py
│ ├── lottery/
│ │ ├── README.md
│ │ ├── data.py
│ │ ├── main.py
│ │ ├── parallel_launch.sh
│ │ ├── requirements.txt
│ │ └── utils.py
│ ├── mixer/
│ │ ├── README.md
│ │ ├── official.py
│ │ ├── ours.py
│ │ └── test_compare.py
│ ├── mixup/
│ │ ├── launch_experiments.sh
│ │ ├── train.py
│ │ └── utils.py
│ ├── ner_evaluation/
│ │ ├── README.md
│ │ ├── ours.py
│ │ ├── test_ours.py
│ │ └── try.py
│ ├── neuron/
│ │ ├── README.md
│ │ ├── evaluate_noise.py
│ │ ├── evaluate_shuffling.py
│ │ ├── evaluate_video.py
│ │ ├── launch.sh
│ │ ├── pretrained/
│ │ │ ├── MLP.pkl
│ │ │ ├── MLP_augment.pkl
│ │ │ ├── invariant_official.pkl
│ │ │ ├── invariant_ours.pkl
│ │ │ ├── linear.pkl
│ │ │ └── linear_augment.pkl
│ │ ├── requirements.txt
│ │ ├── solutions.py
│ │ ├── tasks.py
│ │ ├── torch_utils.py
│ │ └── trainer.py
│ ├── pondernet/
│ │ ├── experiment_1.sh
│ │ ├── experiment_2.sh
│ │ ├── requirements.txt
│ │ ├── train.py
│ │ └── utils.py
│ ├── product_quantization/
│ │ ├── README.md
│ │ ├── convert.py
│ │ ├── custom.py
│ │ ├── faiss_101_ipython.py
│ │ ├── generate_index.py
│ │ ├── parse.py
│ │ ├── requirements.txt
│ │ ├── run_all.sh
│ │ └── run_gradio.py
│ ├── siren/
│ │ ├── activations.py
│ │ ├── core.py
│ │ └── train.py
│ └── vision_transformer/
│ ├── classes.txt
│ ├── custom.py
│ ├── forward.py
│ └── verify.py
└── mini_tutorials/
├── bentoml/
│ ├── README.md
│ ├── bentofile.yaml
│ ├── create_model.py
│ ├── requirements.txt
│ └── service.py
├── custom_optimizer_in_pytorch/
│ ├── custom.py
│ └── src.py
├── deploying_on_kubernetes/
│ ├── Dockerfile
│ ├── DockerfileConda
│ └── README.md
├── embedding/
│ ├── README.md
│ ├── Visualize.ipynb
│ └── src.py
├── fewshot_text_classification/
│ ├── classify.py
│ └── template.jinja2
├── gradient_wrt_input/
│ ├── explain.py
│ ├── fool.py
│ └── utils.py
├── haiku_basics/
│ ├── buffers_in_torch.py
│ ├── parameter.py
│ ├── reallife.py
│ ├── requirements.txt
│ └── state.py
├── httpx_rate_limiting/
│ └── script.py
├── mocking_neural_networks/
│ ├── app.py
│ └── test.py
├── numpy_equality_testing/
│ └── test.py
├── openai_function_calling/
│ └── example.py
├── rag_with_reranking/
│ ├── README.md
│ ├── answer.py
│ ├── input.txt
│ ├── postman_collection.json
│ └── upload_data.py
└── visualizing_activations_with_forward_hooks/
└── src.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2024 Jan Krepl
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# mildlyoverfitted
Code for https://www.youtube.com/c/mildlyoverfitted.
### Overview
| Name | Video | Code |
|--------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------------------------------------------------------------------------|
| Asynchronous requests and rate limiting | [link](https://youtu.be/luWsr9exlE4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/httpx_rate_limiting) |
| BentoML Sagemaker deployment | [link](https://youtu.be/Zci_D4az9FU) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/bentoml) |
| Custom optimizer in PyTorch | [link](https://youtu.be/zvp8K4iX2Cs) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/custom_optimizer_in_pytorch) |
| Deploying machine learning models on Kubernetes | [link](https://youtu.be/DQRNt8Diyw4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/deploying_on_kubernetes) |
| Differentiable augmentation for GANs (using Kornia) | [link](https://youtu.be/J97EM3Clyys) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/diffaugment) |
| DINO in PyTorch | [link](https://youtu.be/psmMEWKk4Uk) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/dino) |
| Few-shot text classification with prompts | [link](https://youtu.be/AhqgDXcBU2M) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/fewshot_text_classification) |
| GPT in PyTorch | [link](https://youtu.be/d7IRM40VMYM) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/gpt) |
| Gradient with respect to input in PyTorch (FGSM attack + Integrated Gradients) | [link](https://youtu.be/5lFiZTSsp40) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/gradient_wrt_input) |
| Growing neural cellular automata in PyTorch | [link](https://youtu.be/21ACbWoF2Oo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/automata) |
| Haiku basics | [link](https://youtu.be/yXCKS-ZoYTY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/haiku_basics) |
| Integer embeddings in PyTorch | [link](https://youtu.be/bybuSBVzOdg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/integer) |
| Mixup in PyTorch | [link](https://youtu.be/hGAKHKqmXdY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/mixup) |
| MLP-Mixer in Flax and PyTorch | [link](https://youtu.be/HqytB2GUbHA) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/mixer) |
| Mocking neural networks: unit testing in deep learning | [link](https://youtu.be/_KVV9jXSzvo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/mocking_neural_networks) |
| NER model evaluation | [link](https://youtu.be/70YAUYP3hrw) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/ner_evaluation) |
| NumPy equality testing | [link](https://youtu.be/sai1g5fjyb8) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/numpy_equality_testing) |
| OpenAI function calling | [link](https://youtu.be/_B7F_6nTVEg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/openai_function_calling) |
| PonderNet in PyTorch | [link](https://youtu.be/JLFz1dU5HR4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/pondernet) |
| Product quantization in Faiss and from scratch | [link](https://youtu.be/PNVJvZEkuXo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/product_quantization) |
| Retrieval augmented generation with OpenSearch and reranking | [link](https://youtu.be/OsE7YcDcPz0) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/rag_with_reranking) |
| SIREN in PyTorch | [link](https://youtu.be/s4iFEoNlYhM) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/siren) |
| The Lottery Ticket Hypothesis and pruning in PyTorch | [link](https://youtu.be/bQt0CLXXAqg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/lottery) |
| The Sensory Neuron as a Transformer in PyTorch | [link](https://youtu.be/mi_mzlhBGAU) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/neuron) |
| `torch.nn.Embedding` explained (+ Character-level language model) | [link](https://youtu.be/euwN5DHfLEo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/embedding) |
| Vision Transformer in PyTorch | [link](https://youtu.be/ovB0ddFtzzA) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/vision_transformer) |
| Visualizing activations with forward hooks (PyTorch) | [link](https://youtu.be/1ZbLA7ofasY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/visualizing_activations_with_forward_hooks) |
================================================
FILE: github_adventures/automata/model.py
================================================
import torch
import torch.nn as nn
class CAModel(nn.Module):
    """Cellular automata model.

    Parameters
    ----------
    n_channels : int
        Number of channels of the grid.

    hidden_channels : int
        Hidden channels that are related to the pixelwise 1x1 convolution.

    fire_rate : float
        Number between 0 and 1. The lower it is the more likely it is for
        cells to be set to zero during the `stochastic_update` process.

    device : torch.device
        Determines on what device we perform all the computations.

    Attributes
    ----------
    update_module : nn.Sequential
        The only part of the network containing trainable parameters. Composed
        of 1x1 convolution, ReLU and 1x1 convolution.

    filters : torch.Tensor
        Constant tensor of shape `(3 * n_channels, 1, 3, 3)`.
    """

    def __init__(self, n_channels=16, hidden_channels=128, fire_rate=0.5, device=None):
        super().__init__()
        # BUG FIX: this attribute used to be hard-coded to 0.5, silently
        # ignoring the `fire_rate` constructor argument.
        self.fire_rate = fire_rate
        self.n_channels = n_channels
        self.device = device or torch.device("cpu")

        # Perceive step: fixed (non-trainable) identity + Sobel filters.
        sobel_filter_ = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]])
        scalar = 8.0

        sobel_filter_x = sobel_filter_ / scalar
        sobel_filter_y = sobel_filter_.t() / scalar
        identity_filter = torch.tensor(
            [
                [0, 0, 0],
                [0, 1, 0],
                [0, 0, 0],
            ],
            dtype=torch.float32,
        )
        filters = torch.stack(
            [identity_filter, sobel_filter_x, sobel_filter_y]
        )  # (3, 3, 3)
        filters = filters.repeat((n_channels, 1, 1))  # (3 * n_channels, 3, 3)
        self.filters = filters[:, None, ...].to(
            self.device
        )  # (3 * n_channels, 1, 3, 3)

        # Update step
        self.update_module = nn.Sequential(
            nn.Conv2d(
                3 * n_channels,
                hidden_channels,
                kernel_size=1,  # (1, 1)
            ),
            nn.ReLU(),
            nn.Conv2d(
                hidden_channels,
                n_channels,
                kernel_size=1,
                bias=False,
            ),
        )
        # Zero-initializing the last convolution makes the initial update a
        # no-op, which stabilizes early training.
        with torch.no_grad():
            self.update_module[2].weight.zero_()

        self.to(self.device)

    def perceive(self, x):
        """Approximate channelwise gradient and combine with the input.

        This is the only place where we include information on the
        neighboring cells. However, we are not using any learnable
        parameters here.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, 3 * n_channels, grid_size, grid_size)`.
        """
        # `groups=n_channels` applies the 3 fixed filters to each channel
        # independently (depthwise convolution).
        return nn.functional.conv2d(x, self.filters, padding=1, groups=self.n_channels)

    def update(self, x):
        """Perform update.

        Note that this is the only part of the forward pass that uses
        trainable parameters.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, 3 * n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.
        """
        return self.update_module(x)

    @staticmethod
    def stochastic_update(x, fire_rate):
        """Run pixel-wise dropout.

        Unlike dropout there is no scaling taking place.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        fire_rate : float
            Number between 0 and 1. The higher the more likely a given cell
            updates.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.
        """
        device = x.device
        mask = (torch.rand(x[:, :1, :, :].shape) <= fire_rate).to(device, torch.float32)
        return x * mask  # broadcasted over all channels

    @staticmethod
    def get_living_mask(x):
        """Identify living cells.

        A cell is alive if it (or one of its 8 neighbors) has an alpha
        channel (channel index 3) above 0.1.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, 1, grid_size, grid_size)` and the
            dtype is bool.
        """
        return (
            nn.functional.max_pool2d(
                x[:, 3:4, :, :], kernel_size=3, stride=1, padding=1
            )
            > 0.1
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_channels, grid_size, grid_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_sample, n_channels, grid_size, grid_size)`.
        """
        pre_life_mask = self.get_living_mask(x)

        y = self.perceive(x)
        dx = self.update(y)
        dx = self.stochastic_update(dx, fire_rate=self.fire_rate)

        x = x + dx

        post_life_mask = self.get_living_mask(x)
        # Only cells alive both before and after the update survive.
        life_mask = (pre_life_mask & post_life_mask).to(torch.float32)

        return x * life_mask
================================================
FILE: github_adventures/automata/train.py
================================================
import argparse
import pathlib
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from model import CAModel
def load_image(path, size=40):
    """Load an RGBA image and premultiply the RGB channels by alpha.

    Parameters
    ----------
    path : pathlib.Path or str
        Path to where the image is located. Note that the image needs to be
        RGBA.

    size : int
        The image will be resized to a square with a side length of `size`.

    Returns
    -------
    torch.Tensor
        4D float image of shape `(1, 4, size, size)`. The RGB channels
        are premultiplied by the alpha channel.
    """
    img = Image.open(path)
    # BUG FIX: `Image.ANTIALIAS` was removed in Pillow 10; `Image.LANCZOS`
    # is the same resampling filter under its proper name.
    img = img.resize((size, size), Image.LANCZOS)
    img = np.float32(img) / 255.0
    img[..., :3] *= img[..., 3:]  # premultiply RGB by alpha
    return torch.from_numpy(img).permute(2, 0, 1)[None, ...]
def to_rgb(img_rgba):
    """Convert RGBA image to RGB image.

    Parameters
    ----------
    img_rgba : torch.Tensor
        4D tensor of shape `(1, 4, size, size)` where the RGB channels
        were already multiplied by the alpha.

    Returns
    -------
    img_rgb : torch.Tensor
        4D tensor of shape `(1, 3, size, size)`.
    """
    rgb = img_rgba[:, :3, ...]
    alpha = img_rgba[:, 3:, ...].clamp(0, 1)
    # Composite over a white background: fully transparent pixels become 1.
    composited = 1.0 - alpha + rgb
    return composited.clamp(0, 1)
def make_seed(size, n_channels):
    """Create a starting tensor for training.

    The only active pixels are going to be in the middle.

    Parameters
    ----------
    size : int
        The height and the width of the tensor.

    n_channels : int
        Overall number of channels. Note that it needs to be higher than 4
        since the first 4 channels represent RGBA.

    Returns
    -------
    torch.Tensor
        4D float tensor of shape `(1, n_channels, size, size)`.
    """
    seed = torch.zeros((1, n_channels, size, size), dtype=torch.float32)
    centre = size // 2
    # Activate alpha and all hidden channels of the centre cell only.
    seed[:, 3:, centre, centre] = 1
    return seed
def main(argv=None):
    """Train a cellular automaton to grow into a target image.

    Parameters
    ----------
    argv : list of str or None
        Command line arguments. If None, they are read from `sys.argv`.
    """
    parser = argparse.ArgumentParser(
        description="Training script for the Cellular Automata"
    )
    parser.add_argument("img", type=str, help="Path to the image we want to reproduce")
    parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=8,
        help="Batch size. Samples will always be taken randomly from the pool."
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        default="cpu",
        help="Device to use",
        choices=("cpu", "cuda"),
    )
    parser.add_argument(
        "-e",
        "--eval-frequency",
        type=int,
        default=500,
        help="Evaluation frequency.",
    )
    parser.add_argument(
        "-i",
        "--eval-iterations",
        type=int,
        default=300,
        help="Number of iterations when evaluating.",
    )
    parser.add_argument(
        "-n",
        "--n-batches",
        type=int,
        default=5000,
        help="Number of batches to train for.",
    )
    parser.add_argument(
        "-c",
        "--n-channels",
        type=int,
        default=16,
        help="Number of channels of the input tensor",
    )
    parser.add_argument(
        "-l",
        "--logdir",
        type=str,
        default="logs",
        help="Folder where all the logs and outputs are saved.",
    )
    parser.add_argument(
        "-p",
        "--padding",
        type=int,
        default=16,
        help="Padding. The shape after padding is (h + 2 * p, w + 2 * p).",
    )
    parser.add_argument(
        "--pool-size",
        type=int,
        default=1024,
        help="Size of the training pool",
    )
    parser.add_argument(
        "-s",
        "--size",
        type=int,
        default=40,
        help="Image size",
    )

    # Parse arguments
    # BUG FIX: `argv` used to be ignored (`parser.parse_args()`), which made
    # the `main(argv)` parameter useless for programmatic invocation.
    args = parser.parse_args(argv)
    print(vars(args))

    # Misc
    device = torch.device(args.device)

    log_path = pathlib.Path(args.logdir)
    log_path.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(log_path)

    # Target image (padded so the pattern has room to grow)
    target_img_ = load_image(args.img, size=args.size)
    p = args.padding
    target_img_ = nn.functional.pad(target_img_, (p, p, p, p), "constant", 0)
    target_img = target_img_.to(device)
    target_img = target_img.repeat(args.batch_size, 1, 1, 1)

    writer.add_image("ground truth", to_rgb(target_img_)[0])

    # Model and optimizer
    model = CAModel(n_channels=args.n_channels, device=device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-3)

    # Pool initialization: every entry starts as the single-pixel seed
    seed = make_seed(args.size, args.n_channels).to(device)
    seed = nn.functional.pad(seed, (p, p, p, p), "constant", 0)
    pool = seed.clone().repeat(args.pool_size, 1, 1, 1)

    for it in tqdm(range(args.n_batches)):
        batch_ixs = np.random.choice(
            args.pool_size, args.batch_size, replace=False
        ).tolist()

        x = pool[batch_ixs]
        # Run a random number of CA steps so the model learns to be stable
        # over varying horizons.
        for i in range(np.random.randint(64, 96)):
            x = model(x)

        # Loss compares only the visible RGBA channels against the target.
        loss_batch = ((target_img - x[:, :4, ...]) ** 2).mean(dim=[1, 2, 3])
        loss = loss_batch.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        writer.add_scalar("train/loss", loss, it)

        # Pool update: the worst sample is replaced by a fresh seed, the
        # remaining grids are written back so future batches continue from
        # grown states.
        argmax_batch = loss_batch.argmax().item()
        argmax_pool = batch_ixs[argmax_batch]
        remaining_batch = [i for i in range(args.batch_size) if i != argmax_batch]
        remaining_pool = [i for i in batch_ixs if i != argmax_pool]

        pool[argmax_pool] = seed.clone()
        pool[remaining_pool] = x[remaining_batch].detach()

        if it % args.eval_frequency == 0:
            x_eval = seed.clone()  # (1, n_channels, size, size)

            eval_video = torch.empty(1, args.eval_iterations, 3, *x_eval.shape[2:])

            for it_eval in range(args.eval_iterations):
                x_eval = model(x_eval)
                x_eval_out = to_rgb(x_eval[:, :4].detach().cpu())
                eval_video[0, it_eval] = x_eval_out

            writer.add_video("eval", eval_video, it, fps=60)


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/diffaugment/README.MD
================================================
# Data
https://hanlab.mit.edu/projects/data-efficient-gans/datasets/100-shot-grumpy_cat.zip
Just unzip it into `data/` and the code should work out of the box.
================================================
FILE: github_adventures/diffaugment/script.py
================================================
import argparse
import pathlib
import pprint
from datetime import datetime
import kornia.augmentation as K
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.utils import make_grid
from tqdm import tqdm
from utils import DatasetImages, Discriminator, Generator, init_weights_
def main(argv=None):
    """Train a DCGAN-style GAN, optionally with differentiable augmentation.

    Parameters
    ----------
    argv : list of str or None
        Command line arguments. If None, they are read from `sys.argv`.
    """
    # CLI
    parser = argparse.ArgumentParser()
    parser.add_argument("name", help="Name of the experiment")
    parser.add_argument(
        "-a",
        "--augment",
        action="store_true",
        help="If True, we apply augmentations",
    )
    parser.add_argument(
        "-b", "--batch-size", type=int, default=16, help="Batch size"
    )
    parser.add_argument(
        "--b1",
        type=float,
        default=0.5,
        help="Adam optimizer hyperparamter",
    )
    parser.add_argument(
        "--b2",
        type=float,
        default=0.999,
        help="Adam optimizer hyperparamter",
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        default="cpu",
        choices=["cpu", "cuda"],
        help="Device to use",
    )
    parser.add_argument(
        "--eval-frequency",
        type=int,
        default=400,
        help="Generate generator images every `eval_frequency` epochs",
    )
    parser.add_argument(
        "--latent-dim",
        type=int,
        default=100,
        help="Dimensionality of the random noise",
    )
    parser.add_argument(
        "--lr", type=float, default=0.0002, help="Learning rate"
    )
    parser.add_argument(
        "--ndf",
        type=int,
        default=32,
        help="Number of discriminator feature maps (after first convolution)",
    )
    parser.add_argument(
        "--ngf",
        type=int,
        default=32,
        help="Number of generator feature maps (before last transposed convolution)",
    )
    parser.add_argument(
        "-n",
        "--n-epochs",
        type=int,
        default=200,
        help="Number of training epochs",
    )
    parser.add_argument(
        "--mosaic-size",
        type=int,
        default=10,
        help="Size of the side of the rectangular mosaic",
    )
    parser.add_argument(
        "-p",
        "--prob",
        type=float,
        default=0.9,
        help="Probability of applying an augmentation",
    )

    args = parser.parse_args(argv)
    args_d = vars(args)
    print(args)

    img_size = 128  # fixed by the Generator/Discriminator architectures

    # Additional parameters
    device = torch.device(args.device)
    mosaic_kwargs = {"nrow": args.mosaic_size, "normalize": True}
    n_mosaic_cells = args.mosaic_size * args.mosaic_size
    sample_showcase_ix = (
        0  # this one will be used to demonstrate the augmentations
    )

    # Differentiable (Kornia) augmentations applied inside the discriminator
    augment_module = torch.nn.Sequential(
        K.RandomAffine(degrees=0, translate=(1 / 8, 1 / 8), p=args.prob),
        K.RandomErasing((0.0, 0.5), p=args.prob),
    )

    # Loss function
    adversarial_loss = torch.nn.BCELoss()

    # Initialize generator and discriminator
    generator = Generator(latent_dim=args.latent_dim, ngf=args.ngf)
    discriminator = Discriminator(
        ndf=args.ndf, augment_module=augment_module if args.augment else None
    )

    generator.to(device)
    discriminator.to(device)

    # Initialize weights
    generator.apply(init_weights_)
    discriminator.apply(init_weights_)

    # Configure data loader
    data_path = pathlib.Path("data")
    tform = transforms.Compose(
        [
            transforms.Resize(img_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )
    dataset = DatasetImages(
        data_path,
        transform=tform,
    )
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=True,
    )

    # Optimizers
    optimizer_G = torch.optim.Adam(
        generator.parameters(), lr=args.lr, betas=(args.b1, args.b2)
    )
    optimizer_D = torch.optim.Adam(
        discriminator.parameters(), lr=args.lr, betas=(args.b1, args.b2)
    )

    # Output path and metadata
    output_path = pathlib.Path("outputs") / args.name
    output_path.mkdir(exist_ok=True, parents=True)

    # Add other parameters (not included in CLI)
    args_d["time"] = datetime.now()
    args_d["kornia"] = str(augment_module)

    # Prepare tensorboard writer
    writer = SummaryWriter(output_path)

    # Log hyperparameters as text
    writer.add_text(
        "hyperparameter",
        pprint.pformat(args_d).replace(
            "\n", "  \n"
        ),  # markdown needs 2 spaces before newline
        0,
    )

    # Log true data
    writer.add_image(
        "true_data",
        make_grid(
            torch.stack([dataset[i] for i in range(n_mosaic_cells)]),
            **mosaic_kwargs
        ),
        0,
    )

    # Log augmented data
    batch_showcase = dataset[sample_showcase_ix][None, ...].repeat(
        n_mosaic_cells, 1, 1, 1
    )
    batch_showcase_aug = discriminator.augment_module(batch_showcase)
    writer.add_image(
        "augmentations", make_grid(batch_showcase_aug, **mosaic_kwargs), 0
    )

    # Prepare evaluation noise (fixed so eval mosaics are comparable)
    z_eval = torch.randn(n_mosaic_cells, args.latent_dim).to(device)

    for epoch in tqdm(range(args.n_epochs)):
        for i, imgs in enumerate(dataloader):
            n_samples, *_ = imgs.shape
            batches_done = epoch * len(dataloader) + i

            # Adversarial ground truths (one-sided label smoothing: 0.9)
            valid = 0.9 * torch.ones(
                n_samples, 1, device=device, dtype=torch.float32
            )
            fake = torch.zeros(n_samples, 1, device=device, dtype=torch.float32)

            # D preparation
            optimizer_D.zero_grad()

            # D loss on reals
            real_imgs = imgs.to(device)
            d_x = discriminator(real_imgs)
            real_loss = adversarial_loss(d_x, valid)
            real_loss.backward()

            # D loss on fakes
            z = torch.randn(n_samples, args.latent_dim).to(device)
            gen_imgs = generator(z)
            d_g_z1 = discriminator(gen_imgs.detach())
            fake_loss = adversarial_loss(d_g_z1, fake)
            fake_loss.backward()

            optimizer_D.step()  # we called backward twice, the result is a sum

            # G preparation
            optimizer_G.zero_grad()

            # G loss
            d_g_z2 = discriminator(gen_imgs)
            g_loss = adversarial_loss(d_g_z2, valid)

            g_loss.backward()
            optimizer_G.step()

            # Logging
            if batches_done % 50 == 0:
                writer.add_scalar("d_x", d_x.mean().item(), batches_done)
                writer.add_scalar("d_g_z1", d_g_z1.mean().item(), batches_done)
                writer.add_scalar("d_g_z2", d_g_z2.mean().item(), batches_done)
                writer.add_scalar(
                    "D_loss", (real_loss + fake_loss).item(), batches_done
                )
                writer.add_scalar("G_loss", g_loss.item(), batches_done)

            if epoch % args.eval_frequency == 0 and i == 0:
                generator.eval()
                discriminator.eval()

                # Generate fake images.
                # IMPROVEMENT: `no_grad` — these images are only logged, so
                # building an autograd graph here wasted memory.
                with torch.no_grad():
                    gen_imgs_eval = generator(z_eval)

                # Generate nice mosaic
                writer.add_image(
                    "fake",
                    make_grid(gen_imgs_eval.data, **mosaic_kwargs),
                    batches_done,
                )

                # Save checkpoint (and potentially overwrite an existing one)
                torch.save(generator, output_path / "model.pt")

                # Make sure generator and discriminator in the training mode
                generator.train()
                discriminator.train()


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/diffaugment/utils.py
================================================
import torch.nn as nn
from PIL import Image
from torch.utils.data import Dataset
class DatasetImages(Dataset):
    """Dataset loading photos on the hard drive.

    Parameters
    ----------
    path : pathlib.Path
        Path to the folder containing all the images.

    transform : None or callable
        The transform to be applied when yielding the image.

    Attributes
    ----------
    all_paths : list
        List of all paths to the `.jpg` images, sorted alphabetically.
    """

    def __init__(self, path, transform=None):
        super().__init__()
        # Only `.jpg` files are considered; sorting makes the order stable.
        self.all_paths = sorted(p for p in path.iterdir() if p.suffix == ".jpg")
        self.transform = transform

    def __len__(self):
        """Compute length of the dataset."""
        return len(self.all_paths)

    def __getitem__(self, ix):
        """Get a single item."""
        image = Image.open(self.all_paths[ix])
        if self.transform is not None:
            image = self.transform(image)
        return image
class Generator(nn.Module):
    """Generator network.

    Parameters
    ----------
    latent_dim : int
        The dimensionality of the input noise.

    ngf : int
        Number of generator filters. Note that the actual number of filters
        will be a multiple of this number and is going to be divided by two in
        each consecutive block of the network.

    Attributes
    ----------
    main : torch.Sequential
        The actual network that is composed of `ConvTranspose2d`, `BatchNorm2d`
        and `ReLU` blocks.
    """

    def __init__(self, latent_dim, ngf=64):
        super().__init__()

        def up_block(c_in, c_out):
            # Transposed convolution that doubles the spatial resolution,
            # followed by batch norm and ReLU.
            return [
                nn.ConvTranspose2d(c_in, c_out, 4, 2, 1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(True),
            ]

        # Stem: noise vector -> (ngf * 16) x 4 x 4
        layers = [
            nn.ConvTranspose2d(latent_dim, ngf * 16, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 16),
            nn.ReLU(True),
        ]
        # Four upsampling stages: 4 -> 8 -> 16 -> 32 -> 64, halving channels.
        for mult in (16, 8, 4, 2):
            layers.extend(up_block(ngf * mult, ngf * (mult // 2)))
        # Head: ngf x 64 x 64 -> 3 x 128 x 128 in [-1, 1]
        layers.append(nn.ConvTranspose2d(ngf, 3, 4, 2, 1, bias=False))
        layers.append(nn.Tanh())

        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input noise of shape `(n_samples, latent_dim)`.

        Returns
        -------
        torch.Tensor
            Generated images of shape `(n_samples, 3, 128, 128)`.
        """
        noise_4d = x.reshape(*x.shape, 1, 1)  # (n_samples, latent_dim, 1, 1)
        return self.main(noise_4d)
class Discriminator(nn.Module):
    """Discriminator network.

    Parameters
    ----------
    ndf : int
        Number of discriminator filters. It represents the number of filters
        after the first convolution block. Each consecutive block will double
        the number.

    augment_module : nn.Module or None
        If provided it represents the Kornia module that performs
        differentiable augmentation of the images.

    Attributes
    ----------
    augment_module : nn.Module
        If the input parameter `augment_module` provided then this is the
        same thing. If not, then this is just an identity mapping.
    """

    def __init__(self, ndf=16, augment_module=None):
        super().__init__()

        def down_block(c_in, c_out):
            # Strided convolution halving the spatial resolution, followed
            # by batch norm and LeakyReLU.
            return [
                nn.Conv2d(c_in, c_out, 4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.2, inplace=True),
            ]

        # Stem: 3 x 128 x 128 -> ndf x 64 x 64 (no batch norm on the input)
        layers = [
            nn.Conv2d(3, ndf, 4, stride=2, padding=1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        # Four downsampling stages: 64 -> 32 -> 16 -> 8 -> 4, doubling channels.
        for mult in (1, 2, 4, 8):
            layers.extend(down_block(ndf * mult, ndf * mult * 2))
        # Head: (ndf * 16) x 4 x 4 -> 1 x 1 x 1 probability
        layers.append(nn.Conv2d(ndf * 16, 1, 4, stride=1, padding=0, bias=False))
        layers.append(nn.Sigmoid())

        self.main = nn.Sequential(*layers)

        self.augment_module = (
            augment_module if augment_module is not None else nn.Identity()
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input images of shape `(n_samples, 3, 128, 128)`.

        Returns
        -------
        torch.Tensor
            Classification outputs of shape `(n_samples, 1)`.
        """
        # Augmentations are only applied in training mode.
        if self.training:
            x = self.augment_module(x)
        scores = self.main(x)  # (n_samples, 1, 1, 1)
        return scores.reshape(len(scores), -1)  # (n_samples, 1)
def init_weights_(module):
    """Initialize weights by sampling from a normal distribution.

    Note that this operation is modifying the weights in place. Modules
    other than (transposed) convolutions and batch norm are left untouched.

    Parameters
    ----------
    module : nn.Module
        Module with trainable weights.
    """
    name = type(module).__name__
    if name == "Conv2d" or name == "ConvTranspose2d":
        nn.init.normal_(module.weight.data, 0.0, 0.02)
    elif name == "BatchNorm2d":
        nn.init.normal_(module.weight.data, 1.0, 0.02)
        nn.init.constant_(module.bias.data, 0.0)
================================================
FILE: github_adventures/dino/data/README.md
================================================
The `Imagenette` dataset was used. You can find it here: https://github.com/fastai/imagenette (320 px version).
================================================
FILE: github_adventures/dino/data/imagenette_labels.json
================================================
{"n01440764": "tench", "n02102040": "english_springer", "n02979186": "cassette_player", "n03000684": "chain_saw", "n03028079": "church", "n03394916": "french_horn", "n03417042": "garbage_truck", "n03425413": "gas_pump", "n03445777": "golf_ball", "n03888257": "parachute"}
================================================
FILE: github_adventures/dino/evaluation.py
================================================
import numpy as np
import torch
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
def compute_knn(backbone, data_loader_train, data_loader_val):
    """Fit a KNN classifier on CLS embeddings and score it on validation.

    All embeddings are materialized in memory and handed to sklearn,
    which is feasible for datasets of this size.

    Parameters
    ----------
    backbone : timm.models.vision_transformer.VisionTransformer
        Vision transformer whose head is just an identity
        mapping.

    data_loader_train, data_loader_val : torch.utils.data.DataLoader
        Training and validation dataloader that does not apply any
        augmentations. Just casting to tensor and then normalizing.

    Returns
    -------
    val_accuracy : float
        Validation accuracy.
    """
    device = next(backbone.parameters()).device

    features = {}
    for split, loader in (("train", data_loader_train), ("val", data_loader_val)):
        emb_batches, label_batches = [], []
        for imgs, y in loader:
            emb_batches.append(backbone(imgs.to(device)).detach().cpu().numpy())
            label_batches.append(y.detach().cpu().numpy())
        features[f"X_{split}"] = np.concatenate(emb_batches)
        features[f"y_{split}"] = np.concatenate(label_batches)

    estimator = KNeighborsClassifier()
    estimator.fit(features["X_train"], features["y_train"])
    predictions = estimator.predict(features["X_val"])

    return accuracy_score(features["y_val"], predictions)
def compute_embedding(backbone, data_loader):
    """Compute CLS embedding and prepare for TensorBoard.

    Parameters
    ----------
    backbone : timm.models.vision_transformer.VisionTransformer
        Vision transformer. The head should be an identity mapping.

    data_loader : torch.utils.data.DataLoader
        Validation dataloader that does not apply any augmentations. Just
        casting to tensor and then normalizing.

    Returns
    -------
    embs : torch.Tensor
        Embeddings of shape `(n_samples, out_dim)`.

    imgs : torch.Tensor
        Images of shape `(n_samples, 3, height, width)`.

    labels : list
        List of strings representing the classes.
    """
    device = next(backbone.parameters()).device
    class_names = data_loader.dataset.classes

    emb_batches = []
    img_batches = []
    labels = []

    for batch_imgs, batch_ys in data_loader:
        batch_imgs = batch_imgs.to(device)
        emb_batches.append(backbone(batch_imgs).detach().cpu())
        # Roughly invert the ImageNet normalization (single mean/std
        # approximation per channel) so TensorBoard shows viewable images.
        img_batches.append(((batch_imgs * 0.224) + 0.45).cpu())
        labels.extend(class_names[i] for i in batch_ys.tolist())

    return torch.cat(emb_batches, dim=0), torch.cat(img_batches, dim=0), labels
================================================
FILE: github_adventures/dino/train.py
================================================
import argparse
import json
import pathlib
import timm
import torch
import torchvision.transforms as transforms
import tqdm
from torch.utils.data import DataLoader, SubsetRandomSampler
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import ImageFolder
from evaluation import compute_embedding, compute_knn
from utils import DataAugmentation, Head, Loss, MultiCropWrapper, clip_gradients
def main():
    """Train DINO on Imagenette and periodically evaluate with a KNN probe.

    The student network is optimized with AdamW while the teacher is kept
    as an exponential moving average (EMA) of the student's weights.
    Embeddings, KNN validation accuracy and the training loss are logged
    to TensorBoard; the best model (by KNN accuracy) is checkpointed.
    """
    parser = argparse.ArgumentParser(
        "DINO training CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-b", "--batch-size", type=int, default=32)
    parser.add_argument(
        "-d", "--device", type=str, choices=("cpu", "cuda"), default="cpu"
    )
    parser.add_argument("-l", "--logging-freq", type=int, default=200)
    # BUGFIX: was `type=int`, which crashes on any explicitly passed value
    # (`int("0.9995")` raises ValueError) and would truncate integral ones.
    parser.add_argument("--momentum-teacher", type=float, default=0.9995)
    parser.add_argument("-c", "--n-crops", type=int, default=4)
    parser.add_argument("-e", "--n-epochs", type=int, default=100)
    parser.add_argument("-o", "--out-dim", type=int, default=1024)
    parser.add_argument("-t", "--tensorboard-dir", type=str, default="logs")
    parser.add_argument("--clip-grad", type=float, default=2.0)
    parser.add_argument("--norm-last-layer", action="store_true")
    parser.add_argument("--batch-size-eval", type=int, default=64)
    parser.add_argument("--teacher-temp", type=float, default=0.04)
    parser.add_argument("--student-temp", type=float, default=0.1)
    parser.add_argument("--pretrained", action="store_true")
    parser.add_argument("-w", "--weight-decay", type=float, default=0.4)
    args = parser.parse_args()
    print(vars(args))

    # Parameters
    vit_name, dim = "vit_deit_small_patch16_224", 384
    path_dataset_train = pathlib.Path("data/imagenette2-320/train")
    path_dataset_val = pathlib.Path("data/imagenette2-320/val")
    path_labels = pathlib.Path("data/imagenette_labels.json")
    logging_path = pathlib.Path(args.tensorboard_dir)
    device = torch.device(args.device)
    n_workers = 4

    # Data related
    with path_labels.open("r") as f:
        label_mapping = json.load(f)

    # 2 of the requested crops are global; the remainder are local.
    transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2)
    transform_plain = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
            transforms.Resize((224, 224)),
        ]
    )

    dataset_train_aug = ImageFolder(path_dataset_train, transform=transform_aug)
    dataset_train_plain = ImageFolder(path_dataset_train, transform=transform_plain)
    dataset_val_plain = ImageFolder(path_dataset_val, transform=transform_plain)

    if dataset_train_plain.classes != dataset_val_plain.classes:
        raise ValueError("Inconsistent classes")

    data_loader_train_aug = DataLoader(
        dataset_train_aug,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=n_workers,
        pin_memory=True,
    )
    data_loader_train_plain = DataLoader(
        dataset_train_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    data_loader_val_plain = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        num_workers=n_workers,
    )
    # Small fixed-stride subset of validation used for embedding projector.
    data_loader_val_plain_subset = DataLoader(
        dataset_val_plain,
        batch_size=args.batch_size_eval,
        drop_last=False,
        sampler=SubsetRandomSampler(list(range(0, len(dataset_val_plain), 50))),
        num_workers=n_workers,
    )

    # Logging
    writer = SummaryWriter(logging_path)
    writer.add_text("arguments", json.dumps(vars(args)))

    # Neural network related
    student_vit = timm.create_model(vit_name, pretrained=args.pretrained)
    teacher_vit = timm.create_model(vit_name, pretrained=args.pretrained)

    student = MultiCropWrapper(
        student_vit,
        Head(
            dim,
            args.out_dim,
            norm_last_layer=args.norm_last_layer,
        ),
    )
    teacher = MultiCropWrapper(teacher_vit, Head(dim, args.out_dim))
    student, teacher = student.to(device), teacher.to(device)

    # The teacher starts as an exact copy of the student and is never
    # trained by backprop — only updated via the EMA below.
    teacher.load_state_dict(student.state_dict())

    for p in teacher.parameters():
        p.requires_grad = False

    # Loss related
    loss_inst = Loss(
        args.out_dim,
        teacher_temp=args.teacher_temp,
        student_temp=args.student_temp,
    ).to(device)
    # Linear learning-rate scaling rule w.r.t. batch size.
    lr = 0.0005 * args.batch_size / 256
    optimizer = torch.optim.AdamW(
        student.parameters(),
        lr=lr,
        weight_decay=args.weight_decay,
    )

    # Training loop
    n_batches = len(dataset_train_aug) // args.batch_size
    best_acc = 0
    n_steps = 0

    for e in range(args.n_epochs):
        for i, (images, _) in tqdm.tqdm(
            enumerate(data_loader_train_aug), total=n_batches
        ):
            if n_steps % args.logging_freq == 0:
                student.eval()

                # Embedding
                embs, imgs, labels_ = compute_embedding(
                    student.backbone,
                    data_loader_val_plain_subset,
                )
                writer.add_embedding(
                    embs,
                    metadata=[label_mapping[l] for l in labels_],
                    label_img=imgs,
                    global_step=n_steps,
                    tag="embeddings",
                )

                # KNN
                current_acc = compute_knn(
                    student.backbone,
                    data_loader_train_plain,
                    data_loader_val_plain,
                )
                writer.add_scalar("knn-accuracy", current_acc, n_steps)
                if current_acc > best_acc:
                    torch.save(student, logging_path / "best_model.pth")
                    best_acc = current_acc

                student.train()

            images = [img.to(device) for img in images]

            # The teacher only sees the two global crops; student sees all.
            teacher_output = teacher(images[:2])
            student_output = student(images)

            loss = loss_inst(student_output, teacher_output)

            optimizer.zero_grad()
            loss.backward()
            clip_gradients(student, args.clip_grad)
            optimizer.step()

            # EMA update of the teacher from the student's weights.
            with torch.no_grad():
                for student_ps, teacher_ps in zip(
                    student.parameters(), teacher.parameters()
                ):
                    teacher_ps.data.mul_(args.momentum_teacher)
                    teacher_ps.data.add_(
                        (1 - args.momentum_teacher) * student_ps.detach().data
                    )

            writer.add_scalar("train_loss", loss, n_steps)

            n_steps += 1
if __name__ == "__main__":
main()
================================================
FILE: github_adventures/dino/utils.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from PIL import Image
class DataAugmentation:
    """Create crops of an input image together with additional augmentation.

    It generates 2 global crops and `n_local_crops` local crops.

    Parameters
    ----------
    global_crops_scale : tuple
        Range of sizes for the global crops.

    local_crops_scale : tuple
        Range of sizes for the local crops.

    n_local_crops : int
        Number of local crops to create.

    size : int
        The size of the final image.

    Attributes
    ----------
    global_1, global_2 : transforms.Compose
        Two global transforms.

    local : transforms.Compose
        Local transform. Note that the augmentation is stochastic so one
        instance is enough and will lead to different crops.
    """

    def __init__(
        self,
        global_crops_scale=(0.4, 1),
        local_crops_scale=(0.05, 0.4),
        n_local_crops=8,
        size=224,
    ):
        self.n_local_crops = n_local_crops

        def random_gaussian_blur(p):
            # Gaussian blur applied with probability `p`.
            return transforms.RandomApply(
                [transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2))],
                p=p,
            )

        def random_resized_crop(scale):
            # Fresh crop instance per pipeline; randomness makes them differ.
            return transforms.RandomResizedCrop(
                size,
                scale=scale,
                interpolation=Image.BICUBIC,
            )

        flip_and_jitter = transforms.Compose(
            [
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomApply(
                    [
                        transforms.ColorJitter(
                            brightness=0.4,
                            contrast=0.4,
                            saturation=0.2,
                            hue=0.1,
                        ),
                    ]
                ),
                transforms.RandomGrayscale(p=0.2),
            ]
        )
        normalize = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
            ]
        )

        self.global_1 = transforms.Compose(
            [
                random_resized_crop(global_crops_scale),
                flip_and_jitter,
                random_gaussian_blur(1.0),  # always apply
                normalize,
            ],
        )

        self.global_2 = transforms.Compose(
            [
                random_resized_crop(global_crops_scale),
                flip_and_jitter,
                random_gaussian_blur(0.1),
                transforms.RandomSolarize(170, p=0.2),
                normalize,
            ],
        )

        self.local = transforms.Compose(
            [
                random_resized_crop(local_crops_scale),
                flip_and_jitter,
                random_gaussian_blur(0.5),
                normalize,
            ],
        )

    def __call__(self, img):
        """Apply transformation.

        Parameters
        ----------
        img : PIL.Image
            Input image.

        Returns
        -------
        all_crops : list
            List of `torch.Tensor` representing different views of
            the input `img`.
        """
        views = [self.global_1(img), self.global_2(img)]
        views.extend(self.local(img) for _ in range(self.n_local_crops))

        return views
class Head(nn.Module):
    """Network hooked up to the CLS token embedding.

    Just a MLP with the last layer being normalized in a particular way.

    Parameters
    ----------
    in_dim : int
        The dimensionality of the token embedding.

    out_dim : int
        The dimensionality of the final layer (we compute the softmax over).

    hidden_dim : int
        Dimensionality of the hidden layers.

    bottleneck_dim : int
        Dimensionality of the second last layer.

    n_layers : int
        The number of layers.

    norm_last_layer : bool
        If True, then we freeze the norm of the weight of the last linear
        layer to 1.

    Attributes
    ----------
    mlp : nn.Sequential
        Vanilla multi-layer perceptron.

    last_layer : nn.Linear
        Reparametrized linear layer with weight normalization. That means
        that it will have `weight_g` and `weight_v` as learnable
        parameters instead of a single `weight`.
    """

    def __init__(
        self,
        in_dim,
        out_dim,
        hidden_dim=512,
        bottleneck_dim=256,
        n_layers=3,
        norm_last_layer=False,
    ):
        super().__init__()
        if n_layers == 1:
            self.mlp = nn.Linear(in_dim, bottleneck_dim)
        else:
            # in_dim -> hidden -> ... -> hidden -> bottleneck, GELU between.
            modules = [nn.Linear(in_dim, hidden_dim), nn.GELU()]
            for _ in range(n_layers - 2):
                modules += [nn.Linear(hidden_dim, hidden_dim), nn.GELU()]
            modules.append(nn.Linear(hidden_dim, bottleneck_dim))
            self.mlp = nn.Sequential(*modules)

        self.apply(self._init_weights)

        self.last_layer = nn.utils.weight_norm(
            nn.Linear(bottleneck_dim, out_dim, bias=False)
        )
        # Fix the magnitude component of the weight-normalized layer to 1.
        self.last_layer.weight_g.data.fill_(1)
        if norm_last_layer:
            self.last_layer.weight_g.requires_grad = False

    def _init_weights(self, m):
        """Initialize learnable parameters of linear submodules."""
        if not isinstance(m, nn.Linear):
            return
        nn.init.normal_(m.weight, std=0.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Of shape `(n_samples, in_dim)`.

        Returns
        -------
        torch.Tensor
            Of shape `(n_samples, out_dim)`.
        """
        bottleneck = self.mlp(x)  # (n_samples, bottleneck_dim)
        # L2-normalize before the weight-normalized projection.
        bottleneck = F.normalize(bottleneck, dim=-1, p=2)

        return self.last_layer(bottleneck)  # (n_samples, out_dim)
class MultiCropWrapper(nn.Module):
    """Convenience class for forward pass of multiple crops.

    Parameters
    ----------
    backbone : timm.models.vision_transformer.VisionTransformer
        Instantiated Vision Transformer. Note that we will take the `head`
        attribute and replace it with `nn.Identity`.

    new_head : Head
        New head that is going to be put on top of the `backbone`.
    """

    def __init__(self, backbone, new_head):
        super().__init__()
        # The original classification head is not needed; replacing it with
        # a pass-through makes the backbone yield the raw CLS embedding.
        backbone.head = nn.Identity()
        self.backbone = backbone
        self.new_head = new_head

    def forward(self, x):
        """Run the forward pass.

        The different crops are concatenated along the batch dimension
        and then a single forward pass is run. The resulting tensor
        is then chunked back to per crop tensors.

        Parameters
        ----------
        x : list
            List of `torch.Tensor` each of shape `(n_samples, 3, size, size)`.

        Returns
        -------
        tuple
            Tuple of `torch.Tensor` each of shape `(n_samples, out_dim)` where
            `out_dim` is determined by `Head`.
        """
        batch = torch.cat(x, dim=0)  # (n_samples * n_crops, 3, size, size)
        embeddings = self.backbone(batch)  # (n_samples * n_crops, in_dim)
        outputs = self.new_head(embeddings)  # (n_samples * n_crops, out_dim)

        return outputs.chunk(len(x))  # n_crops * (n_samples, out_dim)
class Loss(nn.Module):
    """The DINO loss function.

    We subclass `nn.Module` because we want to keep a buffer holding the
    running center of the teacher logits.

    Parameters
    ----------
    out_dim : int
        The dimensionality of the final layer (we compute the softmax over).

    teacher_temp, student_temp : float
        Softmax temperature of the teacher resp. student.

    center_momentum : float
        Hyperparameter for the exponential moving average that determines
        the center logits. The higher the more the running average matters.
    """

    def __init__(
        self, out_dim, teacher_temp=0.04, student_temp=0.1, center_momentum=0.9
    ):
        super().__init__()
        self.student_temp = student_temp
        self.teacher_temp = teacher_temp
        self.center_momentum = center_momentum
        # Buffer (not a parameter): moves with the module across devices
        # but is never updated by the optimizer.
        self.register_buffer("center", torch.zeros(1, out_dim))

    def forward(self, student_output, teacher_output):
        """Evaluate loss.

        Parameters
        ----------
        student_output, teacher_output : tuple
            Tuple of tensors of shape `(n_samples, out_dim)` representing
            logits. The length is equal to number of crops.
            Note that student processed all crops and that the two initial
            crops are the global ones.

        Returns
        -------
        loss : torch.Tensor
            Scalar representing the average loss.
        """
        student_sm = [
            F.log_softmax(s / self.student_temp, dim=-1) for s in student_output
        ]
        # Teacher targets are centered, sharpened and detached (no grads).
        teacher_sm = [
            F.softmax((t - self.center) / self.teacher_temp, dim=-1).detach()
            for t in teacher_output
        ]

        total_loss = 0
        n_loss_terms = 0

        for t_ix, t in enumerate(teacher_sm):
            for s_ix, s in enumerate(student_sm):
                if t_ix == s_ix:
                    continue  # never compare a crop against itself

                # Cross-entropy between teacher target and student prediction.
                total_loss += torch.sum(-t * s, dim=-1).mean()
                n_loss_terms += 1

        total_loss /= n_loss_terms
        self.update_center(teacher_output)

        return total_loss

    @torch.no_grad()
    def update_center(self, teacher_output):
        """Update center used for teacher output.

        Compute the exponential moving average.

        Parameters
        ----------
        teacher_output : tuple
            Tuple of tensors of shape `(n_samples, out_dim)` where each
            tensor represents a different crop.
        """
        momentum = self.center_momentum
        batch_center = torch.cat(teacher_output).mean(
            dim=0, keepdim=True
        )  # (1, out_dim)
        self.center = self.center * momentum + batch_center * (1 - momentum)
def clip_gradients(model, clip=2.0):
    """Rescale norm of computed gradients.

    Each parameter's gradient is scaled down so that its L2 norm does not
    exceed `clip`; gradients already within the limit are left untouched.

    Parameters
    ----------
    model : nn.Module
        Module.

    clip : float
        Maximum norm.
    """
    for param in model.parameters():
        if param.grad is None:
            continue

        grad_norm = param.grad.data.norm(2)
        scale = clip / (grad_norm + 1e-6)
        if scale < 1:
            param.grad.data.mul_(scale)
================================================
FILE: github_adventures/dino/visualize_attentions.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1a3bd5ec",
"metadata": {},
"outputs": [],
"source": [
"import ipywidgets\n",
"import matplotlib.pyplot as plt\n",
"import timm\n",
"import torch\n",
"from torchvision.datasets import ImageFolder\n",
"import torchvision.transforms as transforms\n",
"from torchvision.utils import make_grid\n",
"import torch.nn.functional as F"
]
},
{
"cell_type": "markdown",
"id": "a6eaa0ef",
"metadata": {},
"source": [
"# Helpers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c0b2e7c",
"metadata": {},
"outputs": [],
"source": [
"def get_last_attention(backbone, x):\n",
" \"\"\"Get the attention weights of CLS from the last self-attention layer.\n",
"\n",
" Very hacky!\n",
"\n",
" Parameters\n",
" ----------\n",
" backbone : timm.models.vision_transformer.VisionTransformer\n",
" Instantiated Vision Transformer. Note that we will in-place\n",
" take the `head` attribute and replace it with `nn.Identity`.\n",
"\n",
" x : torch.Tensor\n",
" Batch of images of shape `(n_samples, 3, size, size)`.\n",
"\n",
" Returns\n",
" -------\n",
" torch.Tensor\n",
" Attention weights `(n_samples, n_heads, n_patches)`.\n",
" \"\"\"\n",
" attn_module = backbone.blocks[-1].attn\n",
" n_heads = attn_module.num_heads\n",
"\n",
" # define hook\n",
" inp = None\n",
" def fprehook(self, inputs):\n",
" nonlocal inp\n",
" inp = inputs[0]\n",
"\n",
" # Register a hook\n",
" handle = attn_module.register_forward_pre_hook(fprehook)\n",
"\n",
" # Run forward pass\n",
" _ = backbone(x)\n",
" handle.remove()\n",
"\n",
" B, N, C = inp.shape\n",
" qkv = attn_module.qkv(inp).reshape(B, N, 3, n_heads, C // n_heads).permute(2, 0, 3, 1, 4)\n",
" q, k, v = qkv[0], qkv[1], qkv[2]\n",
"\n",
" attn = (q @ k.transpose(-2, -1)) * attn_module.scale\n",
" attn = attn.softmax(dim=-1)\n",
"\n",
" return attn[:, :, 0, 1:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57b72b84",
"metadata": {},
"outputs": [],
"source": [
"def threshold(attn, k=30):\n",
" n_heads = len(attn)\n",
" indices = attn.argsort(dim=1, descending=True)[:, k:]\n",
"\n",
" for head in range(n_heads):\n",
" attn[head, indices[head]] = 0\n",
"\n",
" attn /= attn.sum(dim=1, keepdim=True)\n",
"\n",
" return attn"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59e9009d",
"metadata": {},
"outputs": [],
"source": [
"def visualize_attention(img, backbone, k=30):\n",
" \"\"\"Create attention image.\n",
"\n",
" Parameteres\n",
" -----------\n",
" img : PIL.Image\n",
" RGB image.\n",
"\n",
" backbone : timm.models.vision_transformer.VisionTransformer\n",
" The vision transformer.\n",
"\n",
" Returns\n",
" -------\n",
" new_img : torch.Tensor\n",
" Image of shape (n_heads, 1, height, width).\n",
" \"\"\"\n",
" # imply parameters\n",
"\n",
" patch_size = backbone.patch_embed.proj.kernel_size[0]\n",
"\n",
" transform = transforms.Compose([\n",
"\n",
" transforms.Resize((224, 224)),\n",
" transforms.ToTensor(),\n",
" transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),\n",
" ]\n",
" )\n",
"\n",
" device = next(backbone.parameters()).device\n",
" x = transform(img)[None, ...].to(device)\n",
" attn = get_last_attention(backbone, x)[0] # (n_heads, n_patches)\n",
" attn = attn / attn.sum(dim=1, keepdim=True) # (n_heads, n_patches)\n",
" attn = threshold(attn, k)\n",
" attn = attn.reshape(-1, 14, 14) # (n_heads, 14, 14)\n",
" attn = F.interpolate(attn.unsqueeze(0),\n",
" scale_factor=patch_size,\n",
" mode=\"nearest\"\n",
" )[0]\n",
"\n",
" return attn"
]
},
{
"cell_type": "markdown",
"id": "df0972ec",
"metadata": {},
"source": [
"# Preparation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6e0d987",
"metadata": {},
"outputs": [],
"source": [
"models = {\n",
" \"supervised\": timm.create_model(\"vit_deit_small_patch16_224\", pretrained=True),\n",
" \"selfsupervised\": torch.load(\"best_model.pth\", map_location=\"cpu\").backbone,\n",
"}\n",
"dataset = ImageFolder(\"data/imagenette2-320/val\")\n",
"\n",
"colors = [\"yellow\", \"red\", \"green\", \"blue\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "690e3a1f",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"@ipywidgets.interact\n",
"def _(\n",
" i=ipywidgets.IntSlider(min=0, max=len(dataset) - 1, continuous_update=False),\n",
" k=ipywidgets.IntSlider(min=0, max=195, value=10, continuous_update=False),\n",
" model=ipywidgets.Dropdown(options=[\"supervised\", \"selfsupervised\"]),\n",
"):\n",
" img = dataset[i][0]\n",
" attns = visualize_attention(img, models[model], k=k).detach()[:].permute(1, 2, 0).numpy()\n",
"\n",
" tform = transforms.Compose([\n",
"\n",
" transforms.Resize((224, 224)),\n",
" ])\n",
" # original image\n",
" plt.imshow(tform(img))\n",
" plt.axis(\"off\")\n",
" plt.show()\n",
"\n",
" kwargs = {\"vmin\": 0, \"vmax\": 0.24}\n",
" # Attentions\n",
" n_heads = 6\n",
"\n",
" fig, axs = plt.subplots(2, 3, figsize=(10, 7))\n",
" \n",
" for i in range(n_heads):\n",
" ax = axs[i // 3, i % 3]\n",
" ax.imshow(attns[..., i], **kwargs)\n",
" ax.axis(\"off\")\n",
" \n",
" plt.tight_layout()\n",
" \n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d83eae10",
"metadata": {},
"outputs": [],
"source": [
"# 3244, 1942, 3482, 688, 1509, 3709"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: github_adventures/dino/visualize_augmentations.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "5801191a",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"import ipywidgets\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import torch\n",
"from PIL import Image\n",
"from torchvision.datasets import ImageFolder\n",
"\n",
"from utils import DataAugmentation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad4f7f91",
"metadata": {},
"outputs": [],
"source": [
"def to_numpy(t):\n",
" array = torch.clip((t * 0.224) + 0.45, 0, 1).permute(1, 2, 0).numpy()\n",
" return array\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db09874a",
"metadata": {},
"outputs": [],
"source": [
"transform = DataAugmentation(n_local_crops=2)\n",
"dataset = ImageFolder(\"data/imagenette2-320/train/\", transform=transform)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48738037",
"metadata": {},
"outputs": [],
"source": [
"@ipywidgets.interact\n",
"def _(\n",
" i=ipywidgets.IntSlider(min=0, max=len(dataset) - 1, continuous_update=False),\n",
" seed=ipywidgets.IntSlider(min=0, max=50, continuous_update=False),\n",
"):\n",
" torch.manual_seed(seed)\n",
" all_crops, _ = dataset[i]\n",
" titles = [\"Global 1\", \"Global 2\", \"Local 1\", \"Local 2\"]\n",
" \n",
" original_img = np.array(Image.open(dataset.samples[i][0]))\n",
" _, ax_orig = plt.subplots(figsize=(15, 5))\n",
" ax_orig.imshow(original_img)\n",
" ax_orig.set_title(\"Original\")\n",
" ax_orig.axis(\"off\")\n",
" \n",
" \n",
" fig, axs = plt.subplots(2, 2, figsize=(10, 10))\n",
" \n",
" for i, title in enumerate(titles):\n",
" ax = axs[i // 2, i % 2]\n",
" ax.imshow(to_numpy(all_crops[i]))\n",
" ax.set_title(title)\n",
" ax.axis(\"off\")\n",
" fig.tight_layout()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: github_adventures/gpt/README.md
================================================
# GPT-2 custom implementation
## Installation
```python
pip install -r requirements.txt
```
## Launching script
To copy weights of an official model + generate some text use the script
`copy_and_generate.py`
```python
(gpt) gpt$ python copy_and_generate.py --help
usage: Copy weights of a HF model and generate text. [-h] [--sample] [-s STEPS] [-r RANDOM_STATE]
[-t TEMPERATURE] [-k TOP_K] [-v]
{gpt2,gpt2-medium,gpt2-large,distilgpt2}
initial_text
positional arguments:
{gpt2,gpt2-medium,gpt2-large,distilgpt2}
Pretrained model to use
initial_text Initial text
optional arguments:
-h, --help show this help message and exit
--sample If True sample randomly otherwise take the most probable token (default: False)
-s STEPS, --steps STEPS
Number of new tokens to generate (default: 30)
-r RANDOM_STATE, --random-state RANDOM_STATE
Random state (default: None)
-t TEMPERATURE, --temperature TEMPERATURE
Softmax logits temperature (default: 1)
-k TOP_K, --top-k TOP_K
If specified, then selecting k most probable tokens (default: None)
-v, --verbose If True, then verbose (default: False)
```
================================================
FILE: github_adventures/gpt/copy_and_generate.py
================================================
import argparse
import logging
import torch
from model import GPT
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import copy_model, generate_token
logging.basicConfig(format="[%(levelname)s] %(asctime)s %(message)s")
logger = logging.getLogger(__file__)
def main(argv=None):
    """Copy weights of a pretrained HF model into ours and generate text.

    Parameters
    ----------
    argv : list or None
        Command line arguments to parse; when None `sys.argv` is used.
    """
    parser = argparse.ArgumentParser(
        "Copy weights of a HF model and generate text.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "model_name",
        type=str,
        choices=("gpt2", "gpt2-medium", "gpt2-large", "distilgpt2"),
        help="Pretrained model to use",
    )
    parser.add_argument(
        "initial_text",
        type=str,
        help="Initial text",
    )
    parser.add_argument(
        "--sample",
        action="store_true",
        help="If True sample randomly otherwise take the most probable token",
    )
    parser.add_argument(
        "-s",
        "--steps",
        default=30,
        type=int,
        help="Number of new tokens to generate",
    )
    parser.add_argument("-r", "--random-state", type=int, help="Random state")
    parser.add_argument(
        "-t",
        "--temperature",
        default=1,
        type=float,
        help="Softmax logits temperature",
    )
    parser.add_argument(
        "-k",
        "--top-k",
        type=int,
        help="If specified, then selecting k most probable tokens",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="If True, then verbose"
    )
    args = parser.parse_args(argv)

    # Setup logging
    if args.verbose:
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARNING)

    # BUGFIX: removed stray ")" that used to sit inside the message.
    logger.info(f"CLI parameters: {vars(args)}")

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model_official = AutoModelForCausalLM.from_pretrained(args.model_name)
    config_official = model_official.config

    # Hyperparameters shared between the HF config and our GPT.
    our_params = [
        "vocab_size",
        "n_layer",
        "n_embd",
        "n_head",
        "n_positions",
        "attn_pdrop",
        "embd_pdrop",
        "resid_pdrop",
        "layer_norm_epsilon",
    ]
    config_ours = {k: getattr(config_official, k) for k in our_params}
    logger.info(f"Model hyperparameters: {config_ours}")

    model_ours = GPT(**config_ours)
    model_ours.eval()

    copy_model(model_official, model_ours)

    token_ixs = tokenizer(args.initial_text)["input_ids"]

    # BUGFIX: `if args.random_state:` silently ignored a seed of 0.
    if args.random_state is not None:
        torch.manual_seed(args.random_state)

    # Generate autoregressively, one token per step.
    for step in range(args.steps):
        new_token_ix = generate_token(
            model_ours,
            token_ixs,
            sample=args.sample,
            top_k=args.top_k,
            temperature=args.temperature,
        )
        token_ixs.append(new_token_ix)
        logger.info(f"Step {step} done")

    text = tokenizer.decode(token_ixs)
    print(text)
if __name__ == "__main__":
main()
================================================
FILE: github_adventures/gpt/distribution_visualizations.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "896ffe86",
"metadata": {},
"outputs": [],
"source": [
"import ipywidgets\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import torch"
]
},
{
"cell_type": "markdown",
"id": "09b6e1f4",
"metadata": {},
"source": [
"#
Applying temperature + keeping only top K values"
]
},
{
"cell_type": "markdown",
"id": "2c7442cf",
"metadata": {},
"source": [
"$T=\\mbox{temperature}$ $$\\large P_i=\\frac{e^{\\frac{y_i}T}}{\\sum_{k=1}^n e^{\\frac{y_k}T}}$$"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "95833de6",
"metadata": {},
"outputs": [],
"source": [
"@ipywidgets.interact\n",
"def _(\n",
" n_tokens=ipywidgets.IntSlider(min=4, max=30, value=8, continuous_update=False),\n",
" random_state=ipywidgets.IntSlider(min=0, max=10, value=2, continuous_update=False),\n",
" temperature=ipywidgets.FloatSlider(min=0, max=10, value=1, continuous_update=False),\n",
" top_k=ipywidgets.IntSlider(min=1, max=20, value=8, continuous_update=False),\n",
" ):\n",
" # Preparations\n",
" top_k = min(top_k, n_tokens)\n",
" torch.manual_seed(random_state)\n",
" logits = 10 * torch.rand(n_tokens,)\n",
"\n",
"\n",
" # Generate original\n",
" probs_orig = torch.nn.functional.softmax(logits, dim=0).numpy()\n",
" \n",
" # Generate new\n",
" logits = logits / temperature\n",
" top_values, _ = torch.topk(logits, top_k) # (top_k,) \n",
" logits[logits < top_values.min()] = -torch.inf \n",
" probs_new = torch.nn.functional.softmax(logits, dim=0).numpy()\n",
"\n",
" # Plotting\n",
" fig, (ax_orig, ax_new) = plt.subplots(1, 2, sharey=True, figsize=(10, 2), dpi=100)\n",
" x = range(n_tokens)\n",
"\n",
" ax_orig.bar(x, probs_orig)\n",
" ax_orig.set_ylim((0, 1))\n",
" ax_orig.set_title(\"Original\")\n",
" \n",
" ax_new.bar(x, probs_new)\n",
" ax_new.set_title(\"Temperature + top K\")\n",
" \n",
" plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: github_adventures/gpt/ipython_code.py
================================================
>>> import torch
>>> from model import GPT
>>> from transformers import AutoModelForCausalLM
>>> hparams_names = [
... "vocab_size",
... "n_layer",
... "n_embd",
... "n_head",
... "n_positions",
... "attn_pdrop",
... "embd_pdrop",
... "resid_pdrop",
... "layer_norm_epsilon",
... ]
...
>>> model_name = "gpt2"
>>> model_official = AutoModelForCausalLM.from_pretrained(model_name, tie_word_embeddings=False)
>>> config_official = model_official.config
>>> config_official
>>> config_ours = {name: getattr(config_official, name) for name in hparams_names}
>>> config_ours
>>> model_ours = GPT(**config_ours)
>>> sum(p.numel() for p in model_ours.parameters())
>>> sum(p.numel() for p in model_official.parameters())
>>> _ = model_official.eval()
>>> _ = model_ours.eval()
>>> idx = torch.tensor([[1, 123, 52, 28]], dtype=torch.long)
>>> logits_official = model_official(idx).logits
>>> logits_ours = model_ours(idx)
>>> logits_official.shape
>>> logits_ours.shape
>>> torch.allclose(logits_ours, logits_official, rtol=0, atol=1e-3)
>>> (logits_ours - logits_official).abs().max()
>>> from utils import copy_model
>>> copy_model(model_official, model_ours)
>>> logits_official = model_official(idx).logits
>>> logits_ours = model_ours(idx)
>>> torch.allclose(logits_ours, logits_official, rtol=0, atol=1e-3)
>>> (logits_ours - logits_official).abs().max()
================================================
FILE: github_adventures/gpt/model.py
================================================
import torch
import torch.nn as nn
from transformers.activations import gelu_new
class CustomGELU(nn.Module):
    """Approximate GELU activation matching the `transformers` implementation.

    Wrapping `gelu_new` in a module makes it usable inside `nn.Sequential`.
    """

    def forward(self, x):
        """Apply the activation element-wise and return the result."""
        activated = gelu_new(x)
        return activated
class Block(nn.Module):
    """Single pre-norm transformer decoder block.

    Parameters
    ----------
    n_embd : int
        Dimensionality of the embeddings.
    n_head : int
        Number of attention heads.
    n_positions : int
        Maximum number of tokens.
    attn_pdrop : float
        Probability of dropout on attention weights.
    resid_pdrop : float
        Probability of dropout after applying the MLP.
    layer_norm_epsilon : float
        Hyperparameter of layer normalization.

    Attributes
    ----------
    ln_1, ln_2 : nn.LayerNorm
        Layer norms applied before attention and before the MLP.
    attention : nn.MultiheadAttention
        Self-attention module.
    mlp : nn.Sequential
        Two-layer feed-forward network with GELU and dropout.
    """

    def __init__(
        self,
        *,
        n_embd,
        n_head,
        n_positions,
        attn_pdrop,
        resid_pdrop,
        layer_norm_epsilon,
    ):
        super().__init__()

        self.ln_1 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)
        self.ln_2 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)

        self.attention = nn.MultiheadAttention(
            embed_dim=n_embd,
            num_heads=n_head,
            dropout=attn_pdrop,
            bias=True,
            batch_first=True,
        )
        # Boolean causal mask: True strictly above the diagonal = position
        # may NOT be attended to (nn.MultiheadAttention's convention).
        full = torch.ones(n_positions, n_positions)
        self.register_buffer("mask", (1 - torch.tril(full)).to(dtype=torch.bool))

        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            CustomGELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape `(batch_size, n_tokens, n_embd)`.

        Returns
        -------
        torch.Tensor
            Output tensor of shape `(batch_size, n_tokens, n_embd)`.
        """
        n_tokens = x.shape[1]

        # Crop the pre-computed mask to the actual sequence length.
        causal = self.mask[:n_tokens, :n_tokens]  # (n_tokens, n_tokens)

        normed = self.ln_1(x)
        attn_out, _ = self.attention(
            normed, normed, normed, attn_mask=causal, need_weights=False
        )  # (batch_size, n_tokens, n_embd)

        # Residual connections around attention and MLP.
        x = x + attn_out
        x = x + self.mlp(self.ln_2(x))

        return x
class GPT(nn.Module):
    """GPT language model: embeddings, a stack of decoder blocks, and a head.

    Parameters
    ----------
    vocab_size : int
        Number of tokens in the vocabulary.
    n_layer : int
        Number of decoder blocks to include.
    n_embd : int
        Dimensionality of the embeddings.
    n_head : int
        Number of attention heads.
    n_positions : int
        Maximum number of tokens.
    attn_pdrop : float
        Probability of dropout on attention weights.
    embd_pdrop : float
        Probability of dropout on the sum of embeddings.
    resid_pdrop : float
        Probability of dropout after applying the MLP.
    layer_norm_epsilon : float
        Hyperparameter of layer normalization.

    Attributes
    ----------
    token_emb : nn.Embedding
        Token embeddings.
    pos_emb : nn.Embedding
        Positional embedding.
    drop : nn.Dropout
        Dropout applied to the sum of the two embeddings.
    blocks : nn.Sequential
        The decoder blocks.
    ln : nn.LayerNorm
        Final layer norm, applied right before `head`.
    head : nn.Linear
        Maps hidden states to vocabulary logits (no bias).
    """

    def __init__(
        self,
        *,
        vocab_size,
        n_layer,
        n_embd,
        n_head,
        n_positions,
        attn_pdrop,
        embd_pdrop,
        resid_pdrop,
        layer_norm_epsilon,
    ):
        super().__init__()
        self.n_positions = n_positions

        self.token_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(n_positions, n_embd)
        self.drop = nn.Dropout(embd_pdrop)

        # All blocks share the same hyperparameters.
        block_kwargs = dict(
            n_embd=n_embd,
            n_head=n_head,
            n_positions=n_positions,
            attn_pdrop=attn_pdrop,
            resid_pdrop=resid_pdrop,
            layer_norm_epsilon=layer_norm_epsilon,
        )
        self.blocks = nn.Sequential(
            *(Block(**block_kwargs) for _ in range(n_layer))
        )

        self.ln = nn.LayerNorm(n_embd, eps=layer_norm_epsilon)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx):
        """Run forward pass.

        Parameters
        ----------
        idx : torch.Tensor
            Integer tensor of shape `(batch_size, n_tokens)` where each
            element is in the range `[0, vocab_size)`.

        Returns
        -------
        logits : torch.Tensor
            Tensor of shape `(batch_size, n_tokens, vocab_size)`.

        Raises
        ------
        ValueError
            If the sequence is longer than `n_positions`.
        """
        n_tokens = idx.shape[1]
        if n_tokens > self.n_positions:
            raise ValueError("There are too many tokens in the input")

        # Positions 0..n_tokens-1 on the same device as the input ids.
        positions = torch.arange(n_tokens, device=idx.device)  # (n_tokens,)

        embeddings = self.token_emb(idx) + self.pos_emb(positions)[None, ...]
        x = self.drop(embeddings)  # (batch_size, n_tokens, n_embd)

        x = self.blocks(x)  # (batch_size, n_tokens, n_embd)
        x = self.ln(x)  # (batch_size, n_tokens, n_embd)

        return self.head(x)  # (batch_size, n_tokens, vocab_size)
================================================
FILE: github_adventures/gpt/requirements.txt
================================================
ipython==7.30.1
ipywidgets==7.6.5
jupyter==1.0.0
matplotlib==3.5.1
numpy==1.21.5
torch==1.10.1
-e git+https://github.com/huggingface/transformers.git@05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe#egg=transformers
================================================
FILE: github_adventures/gpt/utils.py
================================================
import torch
def copy_parameter(param_official, param_ours):
    """Overwrite one tensor in-place with the values of another.

    Parameters
    ----------
    param_official : torch.Tensor
        The value of this tensor will be copied.
    param_ours : torch.Tensor
        This tensor will be overwritten in-place with the values from
        `param_official`.

    Raises
    ------
    ValueError
        If the two tensors do not have identical shapes.
    """
    if param_ours.shape != param_official.shape:
        raise ValueError("The shapes of the provided tensors are different")

    # No autograd tracking for the raw weight copy.
    with torch.no_grad():
        param_ours.copy_(param_official)
def copy_block(block_official, block_ours):
    """Copy all parameters within a transformer block.

    Huggingface stores the attention and MLP projection weights in a
    `Conv1D`-style (transposed) layout, hence the `.T` on the source side.

    Parameters
    ----------
    block_official : transformers.models.gpt2.modeling_gpt2.GPT2Block
        Block coming from the huggingface code.
    block_ours : Block
        Our block.
    """
    src, dst = block_official, block_ours

    pairs = [
        # Layer norms
        (src.ln_1.weight, dst.ln_1.weight),
        (src.ln_1.bias, dst.ln_1.bias),
        (src.ln_2.weight, dst.ln_2.weight),
        (src.ln_2.bias, dst.ln_2.bias),
        # Attention (transposed on the official side)
        (src.attn.c_attn.weight.T, dst.attention.in_proj_weight),
        (src.attn.c_attn.bias, dst.attention.in_proj_bias),
        (src.attn.c_proj.weight.T, dst.attention.out_proj.weight),
        (src.attn.c_proj.bias, dst.attention.out_proj.bias),
        # MLP (transposed on the official side)
        (src.mlp.c_fc.weight.T, dst.mlp[0].weight),
        (src.mlp.c_fc.bias, dst.mlp[0].bias),
        (src.mlp.c_proj.weight.T, dst.mlp[2].weight),
        (src.mlp.c_proj.bias, dst.mlp[2].bias),
    ]
    for official, ours in pairs:
        copy_parameter(official, ours)
def copy_model(model_official, model_ours):
    """Copy all trainable weights from the huggingface model into ours.

    Parameters
    ----------
    model_official : transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel
        Huggingface model.
    model_ours : GPT
        Our model.
    """
    src, dst = model_official, model_ours

    # Positional and token embeddings
    copy_parameter(src.transformer.wpe.weight, dst.pos_emb.weight)
    copy_parameter(src.transformer.wte.weight, dst.token_emb.weight)

    # Decoder blocks, in order
    for src_block, dst_block in zip(src.transformer.h, dst.blocks):
        copy_block(src_block, dst_block)

    # Final layer norm and the language-modelling head
    copy_parameter(src.transformer.ln_f.weight, dst.ln.weight)
    copy_parameter(src.transformer.ln_f.bias, dst.ln.bias)
    copy_parameter(src.lm_head.weight, dst.head.weight)
@torch.no_grad()
def generate_token(
    model, token_ixs, temperature=1.0, sample=False, top_k=None
):
    """Generate a single token given previous tokens.

    Parameters
    ----------
    model : GPT
        Our GPT model.
    token_ixs : list
        List of conditional input token ids.
    temperature : float
        The higher the more variability and vice versa.
    sample : bool
        If True, we sample from the distribution (=there is randomness). If
        False, we just take the argmax (=there is no randomness).
    top_k : int or None
        If not None then we modify the distribution to only contain the `top_k`
        most probable outcomes.

    Returns
    -------
    new_token_ix : int
        Index of the new token.
    """
    # Only the most recent `n_positions` tokens fit into the model's context.
    context_token_ixs = token_ixs[-model.n_positions :]
    ixs = torch.tensor(context_token_ixs, dtype=torch.long)[
        None, :
    ]  # (1, n_tokens)

    logits_all = model(ixs)  # (1, n_tokens, vocab_size)
    logits = logits_all[0, -1, :]  # (vocab_size,)
    logits = logits / temperature  # (vocab_size,)

    if top_k is not None:
        # Find the top k biggest elements, set the remaining elements to -inf.
        # `float("-inf")` instead of `torch.inf`: the `torch.inf` alias does
        # not exist before torch 1.12, while requirements pin torch==1.10.1.
        top_values, _ = torch.topk(logits, top_k)  # (top_k,)
        logits[logits < top_values.min()] = float("-inf")

    probs = torch.nn.functional.softmax(logits, dim=0)  # (vocab_size,)

    if sample:
        new_token_ix = torch.multinomial(probs, num_samples=1)
    else:
        new_token_ix = probs.argmax()

    return new_token_ix.item()
================================================
FILE: github_adventures/integer/README.md
================================================
# On-line encyclopedia of integer sequences
You can use the `fetch_data.py` to download the sequences. However,
I actually found out (after filming the video) that you can literally
download all the sequences here:
https://oeis.org/wiki/Welcome#Compressed_Versions
So you should probably do that and spare their API.
# The GloVe embeddings
The ones that I used in the video are located here:
https://nlp.stanford.edu/data/glove.6B.zip
================================================
FILE: github_adventures/integer/bert.py
================================================
import argparse
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from transformers import BertModel, BertTokenizer
from utils import create_classification_targets, train_classifier
def main(argv=None):
    """Evaluate BERT's pretrained integer-token embeddings.

    Looks up the word-embedding vector of every integer `0 ..
    max_value_eval - 1` that exists as a single token in the BERT
    vocabulary, trains logistic-regression probes on integer properties
    (via `create_classification_targets` / `train_classifier`) and logs
    the metrics plus a projector embedding to TensorBoard.

    Parameters
    ----------
    argv : list or None
        Command line arguments; `None` means they are read from `sys.argv`.
    """
    parser = argparse.ArgumentParser("Evaluating BERT integer embeddings")
    parser.add_argument(
        "log_folder",
        type=str,
        help="Folder where to log results",
    )
    parser.add_argument(
        "--max-value-eval",
        type=int,
        default=500,
        help="Number of integers to run the evaluation on",
    )
    args = parser.parse_args(argv)

    model_name = "bert-base-uncased"

    # Create writer
    writer = SummaryWriter(args.log_folder)

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Retrieve embeddings.  Integers that are not a single token in BERT's
    # vocabulary map to the unknown-token id and are dropped below.
    to_find = list(map(str, range(args.max_value_eval)))
    positions = np.array(tokenizer.convert_tokens_to_ids(to_find))
    unk_token_position = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
    is_valid = positions != unk_token_position
    print(
        "The following numbers are missing",
        [i for i, x in enumerate(is_valid) if not x],
    )
    arange = np.arange(args.max_value_eval)
    numbers = arange[is_valid]
    # Slice the word-embedding matrix at the vocabulary ids of the integers.
    embeddings = (
        model.embeddings.word_embeddings(torch.from_numpy(positions[is_valid]))
        .detach()
        .numpy()
    )
    ys_clf = create_classification_targets(numbers)

    # Metadata columns for the TensorBoard projector: the integer itself
    # plus one column per classification target.
    keys = sorted(ys_clf.keys())
    metadata = np.array([numbers] + [ys_clf[k] for k in keys]).T.tolist()
    metadata_header = ["value"] + keys

    # One logistic-regression probe per property; log train/test accuracy.
    for name, y in ys_clf.items():
        metrics = train_classifier(embeddings, y)
        for metric_name, value in metrics.items():
            writer.add_scalar(
                f"{name}/{metric_name}",
                value,
            )
    writer.add_embedding(
        embeddings,
        metadata=metadata,
        metadata_header=metadata_header,
    )


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/integer/experiments.sh
================================================
# Run all three integer-embedding experiments (GloVe, BERT, LSTM).
# `set -x` echoes each command before executing it, so the sweep is traceable.
set -x

# Shared locations / settings for all three experiments.
OUTPUT_PATH=results
GLOVE_PATH=glove.6B.300d.txt
SEQUENCES_PATH=raw_data.pkl
MAX_VALUE_EVAL=500

# Probe pretrained GloVe embeddings.
python glove.py --max-value-eval $MAX_VALUE_EVAL $GLOVE_PATH $OUTPUT_PATH/glove

# Probe pretrained BERT token embeddings.
python bert.py --max-value-eval $MAX_VALUE_EVAL $OUTPUT_PATH/BERT

# Train an LSTM on the OEIS sequences from scratch and probe its embeddings.
python lstm.py \
    $SEQUENCES_PATH \
    $OUTPUT_PATH/LSTM \
    --batch-size 128 \
    --device cuda \
    --embedding-dim 128 \
    --hidden-dim 256 \
    --max-value-eval $MAX_VALUE_EVAL \
    --max-value 20000 \
    --n-epochs 20000 \
    --sequence-len 100
================================================
FILE: github_adventures/integer/fetch_data.py
================================================
import pathlib
import pickle
import requests
from joblib import Parallel, delayed, parallel_backend
def get_sequence(sequence_id):
    """Get an integer sequence from the online OEIS.

    Parameters
    ----------
    sequence_id : int
        Unique identifier for the desired sequence.

    Returns
    -------
    sequence : list
        List of integers.

    Raises
    ------
    HTTPError
        Was not possible to get the given sequence.
    """
    # OEIS ids are zero-padded to 7 digits, e.g. A0000045.
    url = f"https://oeis.org/search?fmt=json&q=id:A{sequence_id:07}"
    print(sequence_id)

    response = requests.get(url)
    response.raise_for_status()

    payload = response.json()
    data_str = payload["results"][0]["data"]

    return [int(x) for x in data_str.split(",")]
if __name__ == "__main__":
    # Parameters
    n_sequences = 5000
    start_id = 1  # seems like 1 - 340_000 are valid sequences
    n_jobs = 64
    backend = "threading"  # "threading" or "loky"

    # Preparation: the output file name encodes the inclusive id range,
    # e.g. data/1_5000.pkl.
    end_id = start_id + n_sequences
    output_folder = pathlib.Path("data/")
    output_folder.mkdir(exist_ok=True, parents=True)
    output_path = output_folder / f"{start_id}_{end_id - 1}.pkl"

    # Fetch all sequences concurrently; the "threading" backend suits the
    # I/O-bound HTTP requests.
    with parallel_backend(backend, n_jobs=n_jobs):
        res = Parallel()(delayed(get_sequence)(i) for i in range(start_id, end_id))

    with output_path.open("wb") as f:
        pickle.dump(res, f)
================================================
FILE: github_adventures/integer/glove.py
================================================
import argparse
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from utils import create_classification_targets, train_classifier
def main(argv=None):
    """Evaluate pretrained GloVe integer embeddings.

    Reads the embedding vector of each integer `0 .. max_value_eval - 1`
    from a GloVe text file, trains logistic-regression probes on integer
    properties and logs the metrics plus a projector embedding to
    TensorBoard.

    Parameters
    ----------
    argv : list or None
        Command line arguments; `None` means they are read from `sys.argv`.
        Fix: previously `argv` was silently ignored because `parse_args()`
        was called without it, unlike `bert.py` and `lstm.py`.
    """
    parser = argparse.ArgumentParser("Evaluating GloVe integer embeddings")
    parser.add_argument(
        "glove_path",
        type=str,
        help="Path to a txt file holding the GloVe embeddings",
    )
    parser.add_argument(
        "log_folder",
        type=str,
        help="Folder where to log results",
    )
    parser.add_argument(
        "--max-value-eval",
        type=int,
        default=500,
        help="Number of integers to run the evaluation on",
    )
    parser.add_argument(
        "--dim",
        type=int,
        default=300,
        help="Dimensionality of the embeddings",
    )
    args = parser.parse_args(argv)

    # Create writer
    writer = SummaryWriter(args.log_folder)

    # Retrieve embeddings: scan the GloVe file once and keep only lines
    # whose token is one of the integers we are interested in.
    to_find = set(map(str, range(args.max_value_eval)))
    embeddings = np.empty((args.max_value_eval, args.dim))
    with open(args.glove_path) as f:
        for line in f:
            token, *vector_ = line.split(" ")
            if token in to_find:
                embeddings[int(token)] = list(map(float, vector_))
                to_find.remove(token)

    # Every requested integer must have been found in the GloVe vocabulary.
    assert not to_find

    arange = np.arange(args.max_value_eval)
    ys_clf = create_classification_targets(arange)

    # Metadata columns for the TensorBoard projector.
    keys = sorted(ys_clf.keys())
    metadata = np.array([arange] + [ys_clf[k] for k in keys]).T.tolist()
    metadata_header = ["value"] + keys

    # One logistic-regression probe per property; log train/test accuracy.
    for name, y in ys_clf.items():
        metrics = train_classifier(embeddings, y)
        for metric_name, value in metrics.items():
            writer.add_scalar(
                f"{name}/{metric_name}",
                value,
            )
    writer.add_embedding(
        embeddings,
        metadata=metadata,
        metadata_header=metadata_header,
    )


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/integer/lstm.py
================================================
import argparse
import json
import pathlib
import pickle
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from utils import (
CustomDataset,
Network,
create_classification_targets,
train_classifier,
)
def main(argv=None):
    """Train an LSTM next-integer predictor and probe its embeddings.

    Trains `Network` on OEIS sequences with a shifted next-token objective
    and periodically evaluates the learned integer embeddings with
    logistic-regression probes, logging everything to TensorBoard.

    Parameters
    ----------
    argv : list or None
        Command line arguments; `None` means they are read from `sys.argv`.
    """
    parser = argparse.ArgumentParser("Embedding integers using LSTM")
    parser.add_argument(
        "data_path", type=str, help="Path to the pickled sequences"
    )
    parser.add_argument(
        "log_folder", type=str, help="Folder where to log results"
    )
    parser.add_argument(
        "-b", "--batch-size", type=int, default=128, help="Batch size"
    )
    parser.add_argument(
        "-d", "--dense-dim", type=int, default=256, help="Dense dimension"
    )
    parser.add_argument("--device", type=str, default="cpu", help="Device")
    parser.add_argument(
        "-e",
        "--embedding-dim",
        type=int,
        default=128,
        help="Embedding dimension",
    )
    parser.add_argument(
        "--hidden-dim", type=int, default=256, help="Hidden dimension"
    )
    parser.add_argument(
        "--max-value-eval",
        type=int,
        default=500,
        help="Evaluation limit",
    )
    parser.add_argument(
        "-m",
        "--max-value",
        type=int,
        default=20000,
        help="The maximum allowed value (non inclusive)",
    )
    parser.add_argument(
        "-n", "--n-epochs", type=int, default=100, help="Number of epochs"
    )
    parser.add_argument(
        "-l",
        "--sequence-len",
        type=int,
        default=100,
        help="The maximum length of a sequence",
    )
    args = parser.parse_args(argv)

    # Preparations
    device = torch.device(args.device)
    eval_frequency = 500  # evaluate + checkpoint every 500 gradient steps
    log_folder = pathlib.Path(args.log_folder)
    model_path = log_folder / "checkpoint.pth"
    writer = SummaryWriter(log_folder)
    writer.add_text("parameters", json.dumps(vars(args)))

    # Dataset related
    data_path = pathlib.Path(args.data_path)
    with data_path.open("rb") as f:
        raw_sequences = pickle.load(f)
    dataset = CustomDataset(
        raw_sequences,
        max_value=args.max_value,
        sequence_len=args.sequence_len,
    )
    # Log a histogram of all (padded) values so the data can be sanity checked.
    fig, ax = plt.subplots()
    ax.hist(dataset.normalized_sequences.ravel(), bins=100)
    ax.set_title(
        f"Number distribution (numbers={dataset.normalized_sequences.shape})"
    )
    writer.add_figure("number distribution", fig)
    dataloader = DataLoader(
        dataset,
        shuffle=True,
        batch_size=args.batch_size,
        pin_memory=True,
    )

    # Network, loss and the optimizer.  `max_value` is the padding index of
    # the dataset, so padded positions are excluded from the loss.
    net = Network(
        max_value=args.max_value,
        hidden_dim=args.hidden_dim,
        embedding_dim=args.embedding_dim,
        dense_dim=args.dense_dim,
    )
    net.to(device)
    loss_inst = nn.CrossEntropyLoss(
        ignore_index=args.max_value,
    )
    optimizer = torch.optim.Adam(net.parameters())

    # Validation preparation: probe targets for the first `max_value_eval`
    # integers plus metadata columns for the TensorBoard projector.
    max_value_eval = args.max_value_eval or args.max_value
    arange = np.arange(max_value_eval)
    ys_clf = create_classification_targets(arange)
    keys = sorted(ys_clf.keys())
    metadata = np.array([arange] + [ys_clf[k] for k in keys]).T.tolist()
    metadata_header = ["value"] + keys

    step = 0
    for _ in range(args.n_epochs):
        for x in tqdm.tqdm(dataloader):
            x = x.to(device)
            logits_ = net(x)  # (batch_size, sequence_len, max_value)
            # Shifted next-token objective: predict token t+1 from tokens
            # <= t.  CrossEntropyLoss wants classes on dim 1, hence permute.
            logits = logits_[:, :-1].permute(
                0, 2, 1
            )  # (batch_size, max_value, sequence_len - 1)
            target = x[:, 1:]  # (batch_size, sequence_len - 1)
            loss = loss_inst(logits, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar("loss", loss, step)

            if step % eval_frequency == 0:
                # First `max_value_eval` rows of the embedding matrix.
                X = (
                    net.embedding.weight.detach()
                    .cpu()
                    .numpy()[:max_value_eval]
                )
                writer.add_embedding(
                    X,
                    global_step=step,
                    tag="Integer embeddings",
                    metadata=metadata,
                    metadata_header=metadata_header,
                )
                # Probe the current embeddings on all integer properties.
                for name, y in ys_clf.items():
                    metrics = train_classifier(X, y)
                    for metric_name, value in metrics.items():
                        writer.add_scalar(
                            f"{name}/{metric_name}",
                            value,
                            step,
                        )
                torch.save(net, model_path)
            step += 1


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/integer/requirements.txt
================================================
joblib
matplotlib
numpy
requests
scikit-learn
sympy
tensorboard
torch
transformers
================================================
FILE: github_adventures/integer/utils.py
================================================
import numpy as np
import torch
import torch.nn as nn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sympy.ntheory import isprime
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    """Dataset of equal-length integer sequences.

    Parameters
    ----------
    raw_sequences : list of list of int
        Original raw sequences; their lengths differ.
    sequence_len : int
        Target length. Shorter sequences are padded with `max_value`,
        longer ones are truncated.
    max_value : int
        Exclusive upper bound on the values. Sequences whose first
        `sequence_len` elements are not all in `[0, max_value)` are
        discarded.

    Attributes
    ----------
    normalized_sequences : np.ndarray
        2D int64 array of shape `(n_sequences, sequence_len)` holding the
        padded / truncated surviving sequences.
    """

    def __init__(
        self,
        raw_sequences,
        sequence_len=80,
        max_value=2000,
    ):
        def in_range(seq):
            # Only the first `sequence_len` elements matter.
            return all(0 <= value < max_value for value in seq[:sequence_len])

        kept = [seq for seq in raw_sequences if in_range(seq)]

        # `max_value` doubles as the padding value.
        self.normalized_sequences = np.full(
            (len(kept), sequence_len),
            max_value,
            dtype=np.int64,
        )
        for row, seq in enumerate(kept):
            n = min(len(seq), sequence_len)
            self.normalized_sequences[row, :n] = seq[:n]

    def __len__(self):
        """Return the number of sequences."""
        return len(self.normalized_sequences)

    def __getitem__(self, ix):
        """Return the `ix`-th normalized sequence."""
        return self.normalized_sequences[ix]
class Network(nn.Module):
    """Network predicting the next number in a sequence.

    Parameters
    ----------
    max_value : int
        Maximum integer value allowed inside of the sequence. We generate
        an embedding for each number in `[0, max_value]`, where
        `max_value` itself serves as the padding index.
    embedding_dim : int
        Dimensionality of the integer embeddings.
    n_layers : int
        Number of LSTM layers.
    hidden_dim : int
        Dimensionality of the LSTM hidden state.
    dense_dim : int
        Dimensionality of the dense layer.

    Attributes
    ----------
    embedding : torch.nn.Embedding
        Embeddings of all the integers (plus the padding index).
    lstm : torch.nn.LSTM
        Maps integer embeddings to hidden states.
    linear : torch.nn.Linear
        Transforms the hidden states (followed by ReLU).
    classifier : torch.nn.Linear
        Produces logits over all possible integers.
    """

    def __init__(
        self,
        max_value=2000,
        embedding_dim=100,
        n_layers=2,
        hidden_dim=64,
        dense_dim=256,
    ):
        super().__init__()
        # One extra embedding row for the padding value `max_value`.
        self.embedding = nn.Embedding(
            num_embeddings=max_value + 1,
            embedding_dim=embedding_dim,
            padding_idx=max_value,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_dim, dense_dim)
        self.classifier = nn.Linear(dense_dim, max_value)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Long tensor of shape `(batch_size, sequence_len)`.

        Returns
        -------
        logits : torch.Tensor
            Logits over all possible integers of shape
            `(batch_size, sequence_len, max_value)`.
        """
        embedded = self.embedding(x)  # (batch_size, sequence_len, embedding_dim)
        hidden, *_ = self.lstm(embedded)  # (batch_size, sequence_len, hidden_dim)
        dense = torch.relu(self.linear(hidden))  # (batch_size, sequence_len, dense_dim)
        return self.classifier(dense)  # (batch_size, sequence_len, max_value)
def train_classifier(X, y, random_state=2):
    """Cross-validate a logistic-regression classifier.

    Parameters
    ----------
    X : np.ndarray
        2D array holding the features of shape `(n_samples, n_features)`.
    y : np.ndarray
        1D array holding the classification targets of shape `(n_samples,)`.
    random_state : int
        Guaranteeing reproducibility.

    Returns
    -------
    metrics : dict
        Train and validation accuracy averaged over all the folds
        (keys `"train_acc"` and `"test_acc"`).
    """
    # Features are standardized inside each fold to avoid leakage.
    pipeline = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=2000, random_state=random_state),
    )
    folds = StratifiedKFold(
        n_splits=5,
        random_state=random_state,
        shuffle=True,
    )
    scores = cross_validate(
        pipeline,
        X,
        y,
        return_train_score=True,
        cv=folds,
    )
    return {
        "train_acc": scores["train_score"].mean(),
        "test_acc": scores["test_score"].mean(),
    }
def create_classification_targets(indices):
    """Create multiple binary classification targets.

    They represent common properties of integers.

    Parameters
    ----------
    indices : np.ndarray
        1D array holding the integers for which we want to compute
        the targets.

    Returns
    -------
    targets : dict
        Keys are property names and the values are float arrays of the
        same shape as `indices` (1.0 = has the property, 0.0 = does not).
    """
    targets = {
        f"divisibility_{divisor}": (indices % divisor == 0).astype(float)
        for divisor in (2, 3, 4, 5, 10)
    }
    targets["prime"] = np.vectorize(isprime)(indices).astype(float)
    return targets
================================================
FILE: github_adventures/lottery/README.md
================================================
# The Lottery Ticket Hypothesis
## Installation
```bash
pip install -r requirements.txt
```
## Running experiments
The training logic is implemented inside of the script `main.py`. To
get more information about the CLI run
```bash
python main.py --help
```
If you want to run an entire grid search over different hyperparameters
you can use the `parallel_launch.sh` script. Note that it depends on a tool
called `parallel` ([more info](https://www.gnu.org/software/parallel/)). Note
that the script allows for dry runs (default behavior) and progress bars.
```bash
./parallel_launch.sh
```
================================================
FILE: github_adventures/lottery/data.py
================================================
from torch.utils.data import Dataset
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, Lambda, ToTensor
class MNISTDataset(Dataset):
    """MNIST wrapper that flattens every image into a 784-vector.

    Parameters
    ----------
    root : str
        Directory where the actual data is located (or downloaded to).
    train : bool
        If True the training set is returned (60_000 samples). Otherwise
        the validation set is returned (10_000 samples).
    download : bool
        Whether to download the data when it is not present locally.

    Attributes
    ----------
    tv_dataset : MNIST
        Instance of the torchvision `MNIST` dataset class.
    """

    def __init__(self, root, train=True, download=True):
        # ToTensor followed by flattening each image tensor.
        flatten = Lambda(lambda x: x.ravel())
        self.tv_dataset = MNIST(
            root,
            train=train,
            download=download,
            transform=Compose([ToTensor(), flatten]),
        )

    def __len__(self):
        """Return the number of samples."""
        return len(self.tv_dataset)

    def __getitem__(self, ix):
        """Get a selected sample.

        Parameters
        ----------
        ix : int
            Index of the sample to get.

        Returns
        -------
        x : torch.Tensor
            Flattened feature tensor of shape `(784,)`.
        y : torch.Tensor
            Scalar representing the ground truth label, between 0 and 9.
        """
        return self.tv_dataset[ix]
================================================
FILE: github_adventures/lottery/main.py
================================================
import argparse
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import DataLoader
import wandb
from data import MNISTDataset
from utils import MLP, compute_stats, copy_weights_mlp, prune_mlp, reinit_mlp
def loop_dataloader(dataloader):
    """Loop infinitely over a dataloader.

    Parameters
    ----------
    dataloader : DataLoader
        DataLoader streaming batches of samples; it is re-iterated from
        the start every time it is exhausted.

    Yields
    ------
    X_batch : torch.Tensor
        Batch of features.
    y_batch : torch.Tensor
        Batch of predictions.
    """
    # `yield from` replaces the original `for x in iter(dataloader)` loop;
    # the explicit `iter()` call was redundant since `for` already does it.
    while True:
        yield from dataloader
def train(
    model,
    dataloader_train,
    loss_inst,
    optimizer,
    max_iter=10_000,
    dataloader_val=None,
    val_freq=500,
):
    """Run the training loop.

    Parameters
    ----------
    model : nn.Module
        Neural network (in our case MLP).
    dataloader_train : DataLoader
        Dataloader yielding training samples.
    loss_inst : callable
        Computes the loss when called.
    optimizer : torch.optim.Optimizer
        Instance of an optimizer.
    max_iter : int
        The number of iterations we run the training for
        (= number of gradient descent steps).
    dataloader_val : None or DataLoader
        Dataloader yielding validation samples. If provided, it also
        signals that we want to track metrics (loss and validation
        accuracy are then logged to W&B).
    val_freq : int
        How often the evaluation is run (in gradient steps).
    """
    # Infinite stream of batches; we stop via the `max_iter` counter below.
    iterable = loop_dataloader(dataloader_train)
    iterable = tqdm.tqdm(iterable, total=max_iter)

    it = 0
    for X_batch, y_batch in iterable:
        if it == max_iter:
            break
        logit_batch = model(X_batch)
        loss = loss_inst(logit_batch, y_batch)

        # Metric tracking is only active on the final (tracked) run,
        # i.e. when a validation dataloader was handed in.
        if dataloader_val is not None:
            wandb.log({"loss": loss}, step=it)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if it % val_freq == 0 and dataloader_val is not None:
            # Full pass over the validation set -> overall accuracy.
            is_equal = []
            for X_batch_val, y_batch_val in dataloader_val:
                is_equal.append(
                    model(X_batch_val).argmax(dim=-1) == y_batch_val
                )
            is_equal_t = torch.cat(is_equal)
            acc = is_equal_t.sum() / len(is_equal_t)
            wandb.log({"accuracy_val": acc}, step=it)
        it += 1
def main(argv=None):
    """Create CLI and run lottery-ticket train/prune experiments.

    Parameters
    ----------
    argv : list or None
        Command line arguments; `None` means they are read from `sys.argv`.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "-i",
        "--max-iter",
        help="Number of iterations",
        type=int,
        default=50000,
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        help="Batch size",
        type=int,
        default=60,
    )
    parser.add_argument(
        "--prune-iter",
        help="Number of prune iterations",
        type=int,
        default=1,
    )
    parser.add_argument(
        "-m",
        "--prune-method",
        help="Pruning method to employ",
        type=str,
        choices=("l1", "random"),
        default="l1",
    )
    parser.add_argument(
        "-p",
        "--prune-ratio",
        help="Percentage of weights to remove",
        type=float,
        default=0.2,
    )
    parser.add_argument(
        "--val-freq",
        help="How often to compute the validation accuracy",
        type=int,
        default=250,
    )
    parser.add_argument(
        "-r",
        "--reinitialize",
        help="If true, reinitializes randomly all weights after pruning",
        type=str,
        choices=("true", "false"),  # easy for hyperparameter search
        default="false",
    )
    parser.add_argument(
        "-s",
        "--random-state",
        help="Random state",
        type=int,
    )
    parser.add_argument(
        "--wandb-entity",
        help="W&B entity",
        type=str,
        default="mildlyoverfitted",
    )
    parser.add_argument(
        "--wandb-project",
        help="W&B project",
        type=str,
    )
    args = parser.parse_args(argv)

    # Log all hyperparameters and track the best validation accuracy seen.
    wandb.init(
        project=args.wandb_project,
        entity=args.wandb_entity,
        config=vars(args),
    )
    wandb.define_metric("accuracy_val", summary="max")

    dataset_train = MNISTDataset(
        "data",
        train=True,
        download=True,
    )
    dataset_val = MNISTDataset(
        "data",
        train=False,
        download=True,
    )

    # Seeding before the model is built makes the weight init reproducible.
    if args.random_state is not None:
        torch.manual_seed(args.random_state)

    dataloader_train = DataLoader(
        dataset_train, batch_size=args.batch_size, shuffle=True
    )
    dataloader_val = DataLoader(
        dataset_val, batch_size=args.batch_size, shuffle=True
    )

    kwargs = dict(
        n_features=28 * 28,
        hidden_layer_sizes=(300, 100),
        n_targets=10,
    )
    mlp = MLP(**kwargs)
    # Keep an untouched copy of the initial weights so surviving weights
    # can be reset to their original values after each pruning round.
    mlp_copy = MLP(**kwargs)
    mlp_copy.load_state_dict(mlp.state_dict())

    loss_inst = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=1.2 * 1e-3)

    # Train and prune loop
    if args.prune_ratio > 0:
        # Per-round ratio chosen so that `prune_iter` rounds compound to
        # the overall `prune_ratio`.
        per_round_prune_ratio = 1 - (1 - args.prune_ratio) ** (
            1 / args.prune_iter
        )
        per_round_prune_ratios = [per_round_prune_ratio] * len(mlp.module_list)
        # The output layer is pruned at half the rate.
        # NOTE(review): presumably mirrors the halved output-layer pruning
        # rate from the lottery-ticket paper -- confirm.
        per_round_prune_ratios[-1] /= 2
        per_round_max_iter = int(args.max_iter / args.prune_iter)

        for prune_it in range(args.prune_iter):
            train(
                mlp,
                dataloader_train,
                loss_inst,
                optimizer,
                max_iter=per_round_max_iter,
            )
            prune_mlp(mlp, per_round_prune_ratios, method=args.prune_method)
            # Reset the surviving weights back to their initial values.
            copy_weights_mlp(mlp_copy, mlp)

            stats = compute_stats(mlp)
            for name, stat in stats.items():
                summary_name = f"{name}_pruneiter={prune_it}"
                wandb.run.summary[summary_name] = stat

            # Control experiment: random re-init instead of original weights.
            if args.reinitialize == "true":
                reinit_mlp(mlp)

    # Run actual training with a final pruned network
    train(
        mlp,
        dataloader_train,
        loss_inst,
        optimizer,
        max_iter=args.max_iter,
        dataloader_val=dataloader_val,
        val_freq=args.val_freq,
    )


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/lottery/parallel_launch.sh
================================================
# Launch a full hyperparameter grid of main.py runs via GNU parallel.

# Parallel parameters
N_JOBS=4
ARGS="-P$N_JOBS --header :"  # arguments for parallel
# ARGS="--bar "$ARGS
ARGS="--dry-run "$ARGS  # remove this line to actually launch the runs

# Experiment parameters
ENTITY='mildlyoverfitted'
PROJECT='lottery_parallel_2'  # it should already exist to avoid issues
MAX_ITERS=(15000)
PRUNE_ITERS=(1 5)
PRUNE_METHODS=('l1' 'random')
PRUNE_RATIOS=(0 0.1 0.25 0.5 0.8 0.9 0.93 0.97)
REINITIALIZES=('true' 'false')
RANDOM_STATES=(1 2 3 4 5)

# `--header :` makes {name} placeholders refer to the named ::: input lists.
# Fix: the dangling `\` after the last ::: line was removed -- it would have
# merged any non-empty line appended below into this command.
parallel $ARGS \
    python main.py \
    --max-iter={max_iter} \
    --prune-iter={prune_iter} \
    --prune-method={prune_method} \
    --prune-ratio={prune_ratio} \
    --random-state={random_state} \
    --reinitialize={reinitialize} \
    --wandb-entity=$ENTITY \
    --wandb-project=$PROJECT \
    ::: max_iter "${MAX_ITERS[@]}" \
    ::: prune_iter "${PRUNE_ITERS[@]}" \
    ::: prune_method "${PRUNE_METHODS[@]}" \
    ::: prune_ratio "${PRUNE_RATIOS[@]}" \
    ::: random_state "${RANDOM_STATES[@]}" \
    ::: reinitialize "${REINITIALIZES[@]}"
================================================
FILE: github_adventures/lottery/requirements.txt
================================================
numpy
pillow
six
torch
torchvision
tqdm
wandb
================================================
FILE: github_adventures/lottery/utils.py
================================================
import math
import torch
import torch.nn as nn
from torch.nn.utils.prune import l1_unstructured, random_unstructured
class MLP(nn.Module):
    """Plain fully connected network used for the lottery ticket experiments.

    Every linear layer includes a bias term.

    Parameters
    ----------
    n_features : int
        Number of input features (pixels inside of MNIST images).

    hidden_layer_sizes : tuple
        Tuple of ints representing sizes of the hidden layers.

    n_targets : int
        Number of target classes (10 for MNIST).

    Attributes
    ----------
    module_list : nn.ModuleList
        List holding all the linear layers in the right order.
    """

    def __init__(self, n_features, hidden_layer_sizes, n_targets):
        super().__init__()
        sizes = (n_features,) + hidden_layer_sizes + (n_targets,)
        self.module_list = nn.ModuleList(
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(sizes[:-1], sizes[1:])
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Batch of features of shape `(batch_size, n_features)`.

        Returns
        -------
        torch.Tensor
            Batch of predictions (logits) of shape `(batch_size, n_targets)`.
        """
        *hidden, final = self.module_list
        for layer in hidden:
            x = nn.functional.relu(layer(x))
        # No activation on the output layer -- raw logits.
        return final(x)
def prune_linear(linear, prune_ratio=0.3, method="l1"):
    """Prune a linear layer.

    Modifies the module in-place. We make an assumption that the bias
    is included.

    Parameters
    ----------
    linear : nn.Linear
        Linear module containing a bias.

    prune_ratio : float
        Number between 0 and 1 representing the percentage of weights
        to prune.

    method : str, {"l1", "random"}
        Pruning method to use.

    Raises
    ------
    ValueError
        If `method` is not one of the supported pruning methods.
    """
    prune_funcs = {
        "l1": l1_unstructured,
        "random": random_unstructured,
    }
    try:
        prune_func = prune_funcs[method]
    except KeyError:
        # The original code raised a bare ValueError; include the offending
        # value so CLI typos are easy to diagnose.
        raise ValueError(
            f"Unknown pruning method {method!r}, expected one of "
            f"{sorted(prune_funcs)}"
        ) from None

    # Pruning reparametrizes the module: `weight` becomes a tensor computed
    # from `weight_orig * weight_mask` (and analogously for the bias).
    prune_func(linear, "weight", prune_ratio)
    prune_func(linear, "bias", prune_ratio)
def prune_mlp(mlp, prune_ratio=0.3, method="l1"):
    """Prune each layer of the multilayer perceptron.

    Modifies the module in-place. We make an assumption that each
    linear layer has the bias included.

    Parameters
    ----------
    mlp : MLP
        Multilayer perceptron instance.

    prune_ratio : float, int or sequence
        Number between 0 and 1 representing the percentage of weights
        to prune. If a `list` or `tuple` then a different ratio is used
        for each layer.

    method : str, {"l1", "random"}
        Pruning method to use.

    Raises
    ------
    ValueError
        If a sequence of ratios does not match the number of layers.

    TypeError
        If `prune_ratio` is neither a number nor a sequence.
    """
    n_layers = len(mlp.module_list)
    # Accept ints too (e.g. an explicit `0`); the original float-only check
    # rejected them with a TypeError.
    if isinstance(prune_ratio, (int, float)):
        prune_ratios = [prune_ratio] * n_layers
    elif isinstance(prune_ratio, (list, tuple)):
        if len(prune_ratio) != n_layers:
            raise ValueError("Incompatible number of prune ratios provided")
        prune_ratios = list(prune_ratio)
    else:
        raise TypeError(
            f"prune_ratio must be a number or a sequence, got "
            f"{type(prune_ratio).__name__}"
        )

    for ratio, linear in zip(prune_ratios, mlp.module_list):
        prune_linear(linear, prune_ratio=ratio, method=method)
def check_pruned_linear(linear):
    """Tell whether a Linear module currently carries pruning reparametrization.

    A layer counts as pruned only when both the weight and the bias were
    pruned, i.e. the trainable parameters are `weight_orig` and `bias_orig`.

    Parameters
    ----------
    linear : nn.Linear
        Linear module containing a bias.

    Returns
    -------
    bool
        True if the model has been pruned.
    """
    param_names = sorted(name for name, _ in linear.named_parameters())
    return param_names == ["bias_orig", "weight_orig"]
def reinit_linear(linear):
    """Reinitialize a linear layer in-place.

    If the module carries pruning reparametrization we keep it intact
    and only re-sample the underlying tensors -- `weight_orig` and
    `bias_orig`.

    Parameters
    ----------
    linear : nn.Linear
        Linear model containing a bias.
    """
    # Pick the tensors to initialize depending on the pruning state.
    if check_pruned_linear(linear):
        weight, bias = linear.weight_orig, linear.bias_orig
    else:
        weight, bias = linear.weight, linear.bias

    # Mirror the default `nn.Linear.reset_parameters` scheme.
    nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
    fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weight)
    bound = 0 if fan_in <= 0 else 1 / math.sqrt(fan_in)
    nn.init.uniform_(bias, -bound, bound)
def reinit_mlp(mlp):
    """Reinitialize every linear layer of the MLP in-place.

    Parameters
    ----------
    mlp : MLP
        Multi-layer perceptron.
    """
    for layer in mlp.module_list:
        reinit_linear(layer)
def copy_weights_linear(linear_unpruned, linear_pruned):
    """Copy weights from an unpruned layer into a pruned layer.

    Modifies `linear_pruned` in place: the values land in `weight_orig`
    and `bias_orig`, the pruning masks are left untouched.

    Parameters
    ----------
    linear_unpruned : nn.Linear
        Linear model with a bias that was not pruned.

    linear_pruned : nn.Linear
        Linear model with a bias that was pruned.
    """
    assert not check_pruned_linear(linear_unpruned)
    assert check_pruned_linear(linear_pruned)

    with torch.no_grad():
        linear_pruned.weight_orig.copy_(linear_unpruned.weight)
        linear_pruned.bias_orig.copy_(linear_unpruned.bias)
def copy_weights_mlp(mlp_unpruned, mlp_pruned):
    """Copy weights of an unpruned network into a pruned network.

    Modifies `mlp_pruned` in place, layer by layer.

    Parameters
    ----------
    mlp_unpruned : MLP
        MLP model that was not pruned.

    mlp_pruned : MLP
        MLP model that was pruned.
    """
    for src, dst in zip(mlp_unpruned.module_list, mlp_pruned.module_list):
        copy_weights_linear(src, dst)
def compute_stats(mlp):
    """Compute important statistics related to pruning.

    Parameters
    ----------
    mlp : MLP
        Multilayer perceptron. Every layer is expected to be pruned.

    Returns
    -------
    dict
        Per-layer and overall parameter counts plus actual prune ratios.
        All values are plain Python numbers (the original implementation
        leaked 0-d torch tensors from `.sum()` into the dict, which makes
        logging/serialization awkward).
    """
    stats = {}
    total_params = 0
    total_pruned_params = 0

    for layer_ix, linear in enumerate(mlp.module_list):
        assert check_pruned_linear(linear)

        weight_mask = linear.weight_mask
        bias_mask = linear.bias_mask

        params = weight_mask.numel() + bias_mask.numel()
        # `.item()` converts the 0-d tensors produced by `.sum()` to ints.
        pruned_params = int(
            (weight_mask == 0).sum().item() + (bias_mask == 0).sum().item()
        )

        total_params += params
        total_pruned_params += pruned_params

        stats[f"layer{layer_ix}_total_params"] = params
        stats[f"layer{layer_ix}_pruned_params"] = pruned_params
        stats[f"layer{layer_ix}_actual_prune_ratio"] = pruned_params / params

    stats["total_params"] = total_params
    stats["total_pruned_params"] = total_pruned_params
    stats["actual_prune_ratio"] = total_pruned_params / total_params

    return stats
================================================
FILE: github_adventures/mixer/README.md
================================================
Note that the `official.py` is just a copy of the
code provided in `https://arxiv.org/abs/2105.01601` and probably here
`https://github.com/google-research/vision_transformer`. Please refer to those
sources for licensing information.
================================================
FILE: github_adventures/mixer/official.py
================================================
import einops
import flax.linen as nn
import jax.numpy as jnp
class MlpBlock(nn.Module):
    """Reference (Flax) Mixer MLP block: Dense -> GELU -> Dense."""
    mlp_dim: int
    @nn.compact
    def __call__(self, x):
        y = nn.Dense(self.mlp_dim)(x)
        y = nn.gelu(y)
        # Project back to the input's last dimension so the block can be
        # used inside residual connections.
        return nn.Dense(x.shape[-1])(y)
class MixerBlock(nn.Module):
    """One Mixer layer: token mixing then channel mixing, each with a
    pre-LayerNorm and a residual connection."""
    tokens_mlp_dim: int
    channels_mlp_dim: int
    @nn.compact
    def __call__(self, x):
        y = nn.LayerNorm()(x)  # (n_samples, n_patches, hidden_dim)
        # Token mixing: swap axes so the MLP runs across the patch axis.
        y = jnp.swapaxes(y, 1, 2)
        y = MlpBlock(self.tokens_mlp_dim, name="token_mixing")(y)
        y = jnp.swapaxes(y, 1, 2)
        x = x + y
        # Channel mixing: MLP runs across the hidden (channel) axis.
        y = nn.LayerNorm()(x)
        return x + MlpBlock(self.channels_mlp_dim, name="channel_mixing")(y)
class MlpMixer(nn.Module):
    """Full reference MLP-Mixer: conv patch-embedding stem, a stack of
    `num_blocks` MixerBlocks, pre-head LayerNorm, global average pooling
    over patches and a linear classification head."""
    num_classes: int
    num_blocks: int
    patch_size: int
    hidden_dim: int
    tokens_mlp_dim: int
    channels_mlp_dim: int
    @nn.compact
    def __call__(self, x):
        s = self.patch_size
        # Non-overlapping patches: conv with kernel size == stride == s.
        x = nn.Conv(self.hidden_dim, (s, s), strides=(s, s), name="stem")(x)
        # Flatten the spatial grid of patches into one sequence axis.
        x = einops.rearrange(x, "n h w c -> n (h w) c")
        for _ in range(self.num_blocks):
            x = MixerBlock(self.tokens_mlp_dim, self.channels_mlp_dim)(x)
        x = nn.LayerNorm(name="pre_head_layer_norm")(x)
        # Global average pooling over the patch axis.
        x = jnp.mean(x, axis=1)
        # Head weights initialized to zeros as in the reference code.
        return nn.Dense(
            self.num_classes, name="head", kernel_init=nn.initializers.zeros
        )(x)
================================================
FILE: github_adventures/mixer/ours.py
================================================
import einops
import torch.nn as nn
class MlpBlock(nn.Module):
    """Two-layer perceptron with a GELU nonlinearity in between.

    Parameters
    ----------
    dim : int
        Input and output dimension of the entire block. Inside of the mixer
        it will either be equal to `n_patches` or `hidden_dim`.

    mlp_dim : int
        Dimension of the hidden layer. Defaults to `dim` when not provided.

    Attributes
    ----------
    linear_1, linear_2 : nn.Linear
        Linear layers.

    activation : nn.GELU
        Activation.
    """

    def __init__(self, dim, mlp_dim=None):
        super().__init__()
        if mlp_dim is None:
            mlp_dim = dim
        self.linear_1 = nn.Linear(dim, mlp_dim)
        self.activation = nn.GELU()
        self.linear_2 = nn.Linear(mlp_dim, dim)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape `(n_samples, n_channels, n_patches)` or
            `(n_samples, n_patches, n_channels)`.

        Returns
        -------
        torch.Tensor
            Output tensor that has exactly the same shape as the input `x`.
        """
        hidden = self.activation(self.linear_1(x))  # (n_samples, *, mlp_dim)
        return self.linear_2(hidden)  # (n_samples, *, dim)
class MixerBlock(nn.Module):
    """Mixer block: token mixing followed by channel mixing.

    Contains two `MlpBlock`s and two `LayerNorm`s, each mixing step wrapped
    in a residual connection.

    Parameters
    ----------
    n_patches : int
        Number of patches the image is split up into.

    hidden_dim : int
        Dimensionality of patch embeddings.

    tokens_mlp_dim : int
        Hidden dimension for the `MlpBlock` when doing token mixing.

    channels_mlp_dim : int
        Hidden dimension for the `MlpBlock` when doing channel mixing.

    Attributes
    ----------
    norm_1, norm_2 : nn.LayerNorm
        Layer normalization.

    token_mlp_block : MlpBlock
        Token mixing MLP.

    channel_mlp_block : MlpBlock
        Channel mixing MLP.
    """

    def __init__(
        self, *, n_patches, hidden_dim, tokens_mlp_dim, channels_mlp_dim
    ):
        super().__init__()
        self.norm_1 = nn.LayerNorm(hidden_dim)
        self.norm_2 = nn.LayerNorm(hidden_dim)
        self.token_mlp_block = MlpBlock(n_patches, tokens_mlp_dim)
        self.channel_mlp_block = MlpBlock(hidden_dim, channels_mlp_dim)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Tensor of shape `(n_samples, n_patches, hidden_dim)`.

        Returns
        -------
        torch.Tensor
            Tensor of the same shape as `x`, i.e.
            `(n_samples, n_patches, hidden_dim)`.
        """
        # Token mixing: transpose so the MLP runs across the patch axis.
        tokens = self.norm_1(x).permute(0, 2, 1)
        tokens = self.token_mlp_block(tokens)
        x = x + tokens.permute(0, 2, 1)
        # Channel mixing with a residual connection.
        return x + self.channel_mlp_block(self.norm_2(x))
class MlpMixer(nn.Module):
    """Entire MLP-Mixer network.

    Parameters
    ----------
    image_size : int
        Height and width (assuming it is a square) of the input image.

    patch_size : int
        Height and width (assuming it is a square) of the patches. Note
        that we assume that `image_size % patch_size == 0`.

    tokens_mlp_dim : int
        Hidden dimension for the `MlpBlock` when doing the token mixing.

    channels_mlp_dim : int
        Hidden dimension for the `MlpBlock` when doing the channel mixing.

    n_classes : int
        Number of classes for classification.

    hidden_dim : int
        Dimensionality of patch embeddings.

    n_blocks : int
        The number of `MixerBlock`s in the architecture.

    Attributes
    ----------
    patch_embedder : nn.Conv2D
        Splits the image up into multiple patches and then embeds each of
        them (using shared weights).

    blocks : nn.ModuleList
        List of `MixerBlock` instances.

    pre_head_norm : nn.LayerNorm
        Layer normalization applied just before the classification head.

    head_classifier : nn.Linear
        The classification head.
    """

    def __init__(
        self,
        *,
        image_size,
        patch_size,
        tokens_mlp_dim,
        channels_mlp_dim,
        n_classes,
        hidden_dim,
        n_blocks,
    ):
        super().__init__()
        n_patches = (image_size // patch_size) ** 2  # assumes divisibility

        # Kernel size == stride -> non-overlapping patch embedding.
        self.patch_embedder = nn.Conv2d(
            3,
            hidden_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )
        mixer_blocks = [
            MixerBlock(
                n_patches=n_patches,
                hidden_dim=hidden_dim,
                tokens_mlp_dim=tokens_mlp_dim,
                channels_mlp_dim=channels_mlp_dim,
            )
            for _ in range(n_blocks)
        ]
        self.blocks = nn.ModuleList(mixer_blocks)
        self.pre_head_norm = nn.LayerNorm(hidden_dim)
        self.head_classifier = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input batch of square images of shape
            `(n_samples, n_channels, image_size, image_size)`.

        Returns
        -------
        torch.Tensor
            Class logits of shape `(n_samples, n_classes)`.
        """
        # (n_samples, hidden_dim, n_patches ** (1/2), n_patches ** (1/2))
        emb = self.patch_embedder(x)
        # Flatten the spatial grid: (n_samples, n_patches, hidden_dim)
        emb = einops.rearrange(emb, "n c h w -> n (h w) c")
        for block in self.blocks:
            emb = block(emb)  # (n_samples, n_patches, hidden_dim)
        emb = self.pre_head_norm(emb)
        pooled = emb.mean(dim=1)  # (n_samples, hidden_dim)
        return self.head_classifier(pooled)  # (n_samples, n_classes)
================================================
FILE: github_adventures/mixer/test_compare.py
================================================
import jax
import numpy as np
import pytest
import torch
from official import MlpMixer as OfficialMixer
from ours import MlpMixer as OurMixer
@pytest.mark.parametrize("image_size", [6, 12])
@pytest.mark.parametrize("patch_size", [2, 3])
@pytest.mark.parametrize("hidden_dim", [4, 5])
@pytest.mark.parametrize("n_blocks", [1, 2])
@pytest.mark.parametrize("n_classes", [4, 8])
@pytest.mark.parametrize("tokens_mlp_dim", [2, 4])
@pytest.mark.parametrize("channels_mlp_dim", [3, 6])
def test_compare(
image_size,
patch_size,
hidden_dim,
n_blocks,
n_classes,
tokens_mlp_dim,
channels_mlp_dim,
):
# Create Flax model
model_flax = OfficialMixer(
num_classes=n_classes,
num_blocks=n_blocks,
patch_size=patch_size,
hidden_dim=hidden_dim,
tokens_mlp_dim=tokens_mlp_dim,
channels_mlp_dim=channels_mlp_dim,
)
key1, key2 = jax.random.split(jax.random.PRNGKey(0))
x = jax.random.normal(key1, (11, image_size, image_size, 3)) # Dummy input
params = model_flax.init(key2, x) # initialization call
n_params_flax = sum(
jax.tree_leaves(jax.tree_map(lambda x: np.prod(x.shape), params))
)
shape_flax = model_flax.apply(params, x).shape
# Create Torch model
model_torch = OurMixer(
image_size=image_size,
patch_size=patch_size,
hidden_dim=hidden_dim,
n_blocks=n_blocks,
n_classes=n_classes,
tokens_mlp_dim=tokens_mlp_dim,
channels_mlp_dim=channels_mlp_dim,
)
n_params_torch = sum(
p.numel() for p in model_torch.parameters() if p.requires_grad
)
shape_torch = model_torch(torch.rand(11, 3, image_size, image_size)).shape
assert n_params_flax == n_params_torch
assert shape_flax == shape_torch == (11, n_classes)
================================================
FILE: github_adventures/mixup/launch_experiments.sh
================================================
# Run the mixup comparison experiments; each run logs to its own
# TensorBoard directory under tb_results/<seed>/.
# `set -x` echoes every command before executing it.
set -x
N_EPOCHS=100000
N_SAMPLES=1000
SEED=123
TBOARD_DIR=tb_results/$SEED
# Baseline without any regularization.
python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/no_regularization
# Classic regularizers: weight decay and dropout.
python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/weight_decay --weight-decay 0.6
python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/dropout -p 0.2
# Mixup variants: random layer, input only (k=0), hidden layers only
# (k sampled from [1, 4)).
python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/mixup --mixup
python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/input_mixup -k 0 1 --mixup
python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/hidden_layers_mixup -k 1 4 --mixup
================================================
FILE: github_adventures/mixup/train.py
================================================
import argparse
import json
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from utils import (
CustomDataset,
MLPClassifierMixup,
generate_prediction_img,
generate_spirals,
)
def main(argv=None):
    """Train an MLP on the two-spirals dataset, optionally with (manifold) mixup.

    Parses CLI arguments, builds the dataset/model/optimizer, then runs the
    training loop while logging losses, figures and test metrics to
    TensorBoard under `logpath`.
    """
    parser = argparse.ArgumentParser("Training")
    # Parameters
    parser.add_argument(
        "logpath",
        type=str,
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        type=int,
        default=32,
        help="Batch size",
    )
    parser.add_argument(
        "--mixup",
        action="store_true",
    )
    parser.add_argument(
        "-p",
        "--dropout-probability",
        type=float,
        default=0,
        help="The probability of dropout",
    )
    parser.add_argument(
        "--hidden-dims",
        nargs="+",
        type=int,
        default=(32, 32, 32),
        help="Hidden dimensions of the MLP",
    )
    parser.add_argument(
        "-c",
        "--n-cycles",
        type=float,
        default=2,
        help="Number of cycles when creating the spiral dataset",
    )
    parser.add_argument(
        "-n",
        "--n-epochs",
        type=int,
        default=100,
        help="Number of epochs",
    )
    parser.add_argument(
        "-k",
        "--mixing-layer",
        type=int,
        nargs=2,
        default=(None, None),
        help="The range of k to sample from",
    )
    parser.add_argument(
        "-s",
        "--n-samples",
        type=int,
        default=1000,
        help="Number of samples",
    )
    parser.add_argument(
        "-r",
        "--random-state",
        type=int,
        default=5,
        help="Random state",
    )
    parser.add_argument(
        "--weight-decay",
        type=float,
        default=0.0,
        help="Weight decay",
    )
    args = parser.parse_args(argv)
    device = torch.device("cpu")
    dtype = torch.float32
    # Seed both numpy (spirals, k/lambda sampling) and torch (model init).
    np.random.seed(args.random_state)
    torch.manual_seed(args.random_state)
    # Dataset preparation
    X, y = generate_spirals(
        args.n_samples,
        noise_std=0,
        n_cycles=args.n_cycles,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.9,
        shuffle=True,
        stratify=y,
    )
    X_test_t = torch.from_numpy(X_test).to(device, dtype)
    dataset_train = CustomDataset(X_train, y_train)
    # Each batch holds 2 * batch_size samples so that mixup can combine the
    # first half of the batch with the second half.
    dataloader_train = DataLoader(
        dataset_train,
        batch_size=2 * args.batch_size,
        drop_last=True,
        shuffle=True,
    )
    # Model and loss definition
    model = MLPClassifierMixup(
        n_features=2,
        hidden_dims=tuple(args.hidden_dims),
        p=args.dropout_probability,
    )
    model.to(device, dtype)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        weight_decay=args.weight_decay,
    )
    loss_fn = torch.nn.BCEWithLogitsLoss()
    # Summary
    writer = SummaryWriter(args.logpath)
    writer.add_text("hparams", json.dumps(vars(args)))
    # Training + evaluation loop
    bs = args.batch_size
    n_steps = 0
    for e in range(args.n_epochs):
        for X_batch, y_batch in dataloader_train:
            X_batch, y_batch = X_batch.to(device, dtype), y_batch.to(
                device, dtype
            )
            if args.mixup:
                # Sample the mixing layer k and the convex-combination
                # coefficient lambda ~ Beta(2, 2).
                k_min, k_max = args.mixing_layer
                # NOTE(review): `or` treats an explicit 0 like None. Fine
                # for k_min (fallback is 0 anyway), but an explicit k_max=0
                # would silently become n_hidden + 1 -- confirm intended.
                k_min = k_min or 0
                k_max = k_max or model.n_hidden + 1
                k = np.random.randint(k_min, k_max)
                lam = np.random.beta(2, 2)
                writer.add_scalar("k", k, n_steps)
                writer.add_scalar("lambda", lam, n_steps)
                # Forward up to layer k, mix the two half-batches, then
                # finish the pass from layer k onward.
                h = model(X_batch, start=0, end=k)  # (2 * batch_size, *)
                h_mixed = lam * h[:bs] + (1 - lam) * h[bs:]  # (batch_size, *)
                y_mixed = lam * y_batch[:bs] + (1 - lam) * y_batch[bs:]  # (batch_size,)
                logits = model(h_mixed, start=k, end=None)  # (batch_size, 1)
                loss = loss_fn(logits.squeeze(), y_mixed)
            else:
                # Without mixup only the first half of the batch is used,
                # keeping the effective batch size identical to mixup runs.
                logits = model(X_batch[:bs])  # (batch_size, 1)
                loss = loss_fn(logits.squeeze(), y_batch[:bs])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Logging
            writer.add_scalar("loss_train", loss, n_steps)
            if n_steps % 2500 == 0:
                # Periodic evaluation: prediction figures + test metrics.
                model.eval()
                fig_gen = generate_prediction_img(
                    model,
                    X_train,
                    X_test,
                    y_train,
                    y_test,
                )
                # NOTE(review): the "test" figure is logged without a
                # global step, unlike the two contour figures -- confirm
                # this is intentional.
                writer.add_figure("test", next(fig_gen))
                writer.add_figure("contour", next(fig_gen), n_steps)
                writer.add_figure("contour_train", next(fig_gen), n_steps)
                with torch.no_grad():
                    logits_test = model(X_test_t).squeeze().detach().cpu()
                    acc_test = (
                        torch.sigmoid(logits_test).round().numpy() == y_test
                    ).sum() / len(y_test)
                    loss_test = loss_fn(logits_test, torch.from_numpy(y_test))
                    writer.add_scalar("loss_test", loss_test, n_steps)
                    writer.add_scalar("accuracy_test", acc_test, n_steps)
                model.train()
            n_steps += 1
# Script entry point.
if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/mixup/utils.py
================================================
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from matplotlib.colors import ListedColormap
from torch.utils.data import Dataset
class MLPClassifierMixup(nn.Module):
    """Binary MLP classifier whose forward pass can start/stop at any layer.

    Parameters
    ----------
    n_features : int
        Number of features.

    hidden_dims : tuple
        The sizes of the hidden layers.

    p : float
        Dropout probability.

    Attributes
    ----------
    hidden_layers : nn.ModuleList
        List of hidden layers that are each composed of a `Linear`,
        `LeakyReLU` and `Dropout` modules.

    n_hidden : int
        Number of hidden layers.

    clf : nn.Linear
        The classifier at the end of the pipeline.
    """

    def __init__(self, n_features, hidden_dims, p=0):
        super().__init__()
        dims = (n_features,) + hidden_dims
        self.n_hidden = len(hidden_dims)

        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            layers.append(
                nn.Sequential(
                    nn.Linear(in_dim, out_dim),
                    nn.LeakyReLU(0.2),
                    nn.Dropout(p),
                )
            )
        self.hidden_layers = nn.ModuleList(layers)
        self.clf = nn.Linear(dims[-1], 1)

    def forward(self, x, start=0, end=None):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input of shape `(n_samples, dim)`. Note that the dim
            will depend on `start`.

        start : int
            The hidden layer where the forward pass starts (inclusive). We
            use a convention of `start=0` and `end=0` as a noop and the input
            tensor is returned. Useful for implementing input mixing.

        end : int or None
            The ending hidden layer (exclusive). If None, then always run
            until the last hidden layer and then we also apply the
            classifier.
        """
        for block in self.hidden_layers[start:end]:
            x = block(x)
        # Only a full pass (end=None) goes through the classification head.
        return self.clf(x) if end is None else x
class CustomDataset(Dataset):
    """In-memory binary classification dataset.

    Parameters
    ----------
    X : np.ndarray
        Features of shape `(n_samples, n_features)`.

    y : np.ndarray
        Targets of shape `(n_samples,)`. Must contain exactly the two
        classes 0 and 1.
    """

    def __init__(self, X, y):
        if len(X) != len(y):
            raise ValueError("Inconsistent number of samples")

        # Both classes (and nothing else) need to be present.
        if not np.array_equal(np.sort(np.unique(y)), np.array([0, 1])):
            raise ValueError

        self.X = X
        self.y = y

    def __len__(self):
        """Compute the length of the dataset."""
        return len(self.X)

    def __getitem__(self, ix):
        """Return a single sample."""
        return self.X[ix], self.y[ix]
def generate_spirals(
    n_samples,
    noise_std=0.05,
    n_cycles=2,
    random_state=None,
):
    """Create the classic two-spirals toy dataset.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate. For simplicity, an even number
        is required. The targets (2 spirals) are perfectly balanced.

    noise_std : float
        Standard deviation of the noise added to the spirals.

    n_cycles : int
        Number of revolutions the spirals make.

    random_state : int or None
        Controls randomness.

    Returns
    -------
    X : np.ndarray
        Features of shape `(n_samples, n_features)`.

    y : np.ndarray
        Targets of shape `(n_samples,)`. There are two
        classes 0 and 1 representing the two spirals.
    """
    if n_samples % 2 != 0:
        raise ValueError("The number of samples needs to be even")

    n_per_class = int(n_samples // 2)

    # Both spirals share the radii; the second spiral is the first one
    # rotated by pi.
    angles = np.linspace(0, n_cycles * 2 * np.pi, n_per_class)
    radii = np.linspace(0.2, 2, n_per_class)

    spiral_1 = np.stack(
        [radii * np.cos(angles), radii * np.sin(angles)], axis=1
    )
    spiral_2 = np.stack(
        [radii * np.cos(np.pi + angles), radii * np.sin(np.pi + angles)],
        axis=1,
    )

    X = np.concatenate([spiral_1, spiral_2], axis=0)
    y = np.zeros((n_samples,))
    y[n_per_class:] = 1.0

    if random_state is not None:
        np.random.seed(random_state)

    # Shuffle the samples first, then perturb the coordinates.
    new_ixs = np.random.permutation(n_samples)
    X = X[new_ixs] + np.random.normal(
        loc=0, scale=noise_std, size=(n_samples, 2)
    )
    y = y[new_ixs]

    return X, y
def generate_prediction_img(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
):
    """Generate contour and scatter plots with predictions.

    The same figure object is mutated between yields, so consume (e.g. log)
    each yielded figure before advancing the generator.

    Parameters
    ----------
    model : MLPClassifierMixup
        Instance of a multilayer-perceptron.

    X_train, X_test : np.ndarray
        Train and test features of shape `(n_samples, n_features)`.

    y_train, y_test : np.ndarray
        Train and test targets of shape `(n_samples,)`.

    Yields
    ------
    matplotlib.Figure
        Different figures.
    """
    # Evaluate the grid on the same device/dtype the model lives on.
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype
    cm = plt.cm.RdBu
    cm_bright = ListedColormap(["#FF0000", "#0000FF"])
    # Plotting window: bounding box of the test data plus a margin.
    delta = 0.5
    xlim = (X_test[:, 0].min() - delta, X_test[:, 0].max() + delta)
    ylim = (X_test[:, 1].min() - delta, X_test[:, 1].max() + delta)
    # Dense n x n grid of points for the probability contours.
    n = 50
    xx, yy = np.meshgrid(
        np.linspace(xlim[0], xlim[1], n),
        np.linspace(ylim[0], ylim[1], n),
    )
    grid = np.stack([xx.ravel(), yy.ravel()], axis=1)
    with torch.no_grad():
        logits = model(torch.from_numpy(grid).to(device, dtype))
        probs = torch.sigmoid(logits)[:, 0].detach().cpu().numpy()
    probs = probs.reshape(xx.shape)
    fig, ax = plt.subplots(1, 1, dpi=170)
    # Figure 1: scatter plot of the test data.
    ax.scatter(
        X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors="k"
    )
    ax.set_title("Test data")
    yield fig
    # Figure 2: prediction contours (axes cleared first).
    ax.cla()
    ax.contourf(xx, yy, probs, cmap=cm, alpha=0.8)
    ax.set_title("Prediction contours")
    yield fig
    # Figure 3: train data overlaid on the contours.
    ax.scatter(
        X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
    )
    ax.set_title("Train data + prediction contours")
    yield fig
================================================
FILE: github_adventures/ner_evaluation/README.md
================================================
* https://github.com/huggingface/evaluate/blob/af3c30561d840b83e54fc5f7150ea58046d6af69/metrics/seqeval/seqeval.py#L120
* https://github.com/chakki-works/seqeval/blob/cd01b5210eaa65e691c22320aba56f2be9e9fc43/seqeval/metrics/sequence_labeling.py#L1
================================================
FILE: github_adventures/ner_evaluation/ours.py
================================================
import re
import pandas as pd
from sklearn.metrics import classification_report
def check_valid(annots: list[str]) -> bool:
    """Validate a sequence of IOB2 annotations.

    Every tag must be "O", "B-<type>" or "I-<type>", and an "I-" tag must
    continue an entity of the same type started directly before it.
    """
    tag_re = re.compile(r"^(O$|B-.+$|I-.+$)")
    # Left-pad with "O" so the first real tag has a predecessor to check.
    padded = ["O"] + annots

    if not all(tag_re.match(tag) for tag in padded):
        return False

    for prev, curr in zip(padded, padded[1:]):
        if curr.startswith("I-") and (prev == "O" or prev[2:] != curr[2:]):
            return False

    return True
def get_etypes(annots: list[str]) -> list[None | str]:
return [annot[2:] if annot != "O" else None for annot in annots]
def get_entities(annots: list[str]) -> list[dict[str, int | str]]:
    """Extract entity spans from a valid IOB2 annotation sequence.

    Returns a list of dicts with inclusive `start`/`end` token indices
    (relative to the unpadded input) and the entity type `etype`.

    Raises
    ------
    ValueError
        If `annots` is not a valid IOB2 sequence.
    """
    if not check_valid(annots):
        raise ValueError("Invalid input.")
    # Pad with "O" on both sides so entities touching the sequence borders
    # are handled by the same prev/curr transition patterns below.
    annots = ["O"] + annots + ["O"]
    etypes = get_etypes(annots)
    n = len(annots)
    # Tag-prefix transitions that OPEN an entity at the current position.
    start_patterns = {
        ("O", "B-"),  # ["O", "B-LOC"]
        ("B-", "B-"),  # ["B-PERSON", "B-LOC"]
        ("I-", "B-"),  # ["B-LOC", "I-LOC", "B-PERSON"]
    }
    # Tag-prefix transitions that CLOSE the entity ending at the previous
    # position.
    end_patterns = {
        ("I-", "O"),  # ["B-LOC", "I-LOC", "O"]
        ("B-", "O"),  # ["B-LOC", "O"]
        ("B-", "B-"),  # ["B-PERSON", "B-LOC"]
        ("I-", "B-"),  # ["B-LOC", "I-LOC", "B-PERSON"]
    }
    entities: list[dict[str, int | str]] = []
    i = 1
    start = None
    while i < n:
        prev, curr = annots[i - 1], annots[i]
        pattern = (prev[:2], curr[:2])
        if pattern in end_patterns and start is not None:
            entities.append(
                {
                    # `start` and `i` index into the padded list; subtract
                    # the left padding to report original (inclusive) spans.
                    "start": start - 1,
                    "end": i - 2,
                    "etype": etypes[i - 1],
                }
            )
            start = None
        if pattern in start_patterns:
            start = i
        i += 1
    return entities
def get_report(annots_true: list[str], annots_pred: list[str]) -> dict:
    """Compute a per-entity-type classification report.

    Entities from the ground truth and the prediction are matched on their
    exact (start, end) spans; an unmatched entity gets an empty-string type
    on the other side and therefore counts against precision/recall.
    """
    if len(annots_true) != len(annots_pred):
        raise ValueError("Unequal lengths")

    df_true = pd.DataFrame(get_entities(annots_true)).rename(
        columns={"etype": "etype_true"}
    )
    df_pred = pd.DataFrame(get_entities(annots_pred)).rename(
        columns={"etype": "etype_pred"}
    )

    # Outer join on the span keeps entities present on only one side.
    df = df_true.merge(df_pred, on=["start", "end"], how="outer").fillna("")

    labels = set(df["etype_true"]) | set(df["etype_pred"])
    labels.discard("")

    return classification_report(
        df["etype_true"],
        df["etype_pred"],
        output_dict=True,
        labels=list(labels),
    )
================================================
FILE: github_adventures/ner_evaluation/test_ours.py
================================================
import pytest
from seqeval.metrics import classification_report as cr
from seqeval.scheme import IOB2
from ours import check_valid, get_entities, get_etypes, get_report
@pytest.mark.parametrize(
    "inp,out",
    [
        # (annotations, expected validity)
        ([], True),
        (["NONSENSE", "O"], False),
        (["O", "O", "O"], True),
        (["B-"], False),
        (["O", "I-ORG", "O"], False),
        (["O", "B-ORG", "I-PERSON"], False),
        (["O", "B-ORG", "B-PERSON"], True),
        (["O", "SOMETHING", "B-PERSON"], False),
        (["O-", "O", "O"], False),
        (["B-A", "O", "B-T"], True),
        (["I-a", "B-a", "B-a", "I-a", "I-a", "O"], False),
    ],
)
def test_check_valid(inp, out):
    """IOB2 validity detection for well-formed and malformed sequences."""
    assert check_valid(inp) == out
@pytest.mark.parametrize(
    "inp,out",
    [
        # (annotations, expected entity types; None for "O")
        ([], []),
        (["O", "O", "O"], [None, None, None]),
        (["O", "B-ORG", "O"], [None, "ORG", None]),
        (["O", "B-ORG", "B-ORG"], [None, "ORG", "ORG"]),
        (["O", "B-PERSON", "I-PERSON"], [None, "PERSON", "PERSON"]),
        (["B-A", "O", "B-T"], ["A", None, "T"]),
    ],
)
def test_get_etypes(inp, out):
    """Per-token entity type extraction."""
    assert get_etypes(inp) == out
@pytest.mark.parametrize(
    "inp,out",
    [
        # (annotations, expected entity spans with inclusive start/end)
        (["O", "O", "O"], []),
        (["O", "B-ORG", "O"], [{"start": 1, "end": 1, "etype": "ORG"}]),
        (
            ["O", "B-ORG", "B-ORG"],
            [
                {"start": 1, "end": 1, "etype": "ORG"},
                {"start": 2, "end": 2, "etype": "ORG"},
            ],
        ),
        (["O", "B-PERSON", "I-PERSON"], [{"start": 1, "end": 2, "etype": "PERSON"}]),
        (
            ["B-A", "O", "B-T"],
            [
                {"start": 0, "end": 0, "etype": "A"},
                {"start": 2, "end": 2, "etype": "T"},
            ],
        ),
        (["B-LOC", "I-LOC", "I-LOC"], [{"start": 0, "end": 2, "etype": "LOC"}]),
        (
            ["B-A", "I-A", "B-T"],
            [
                {"start": 0, "end": 1, "etype": "A"},
                {"start": 2, "end": 2, "etype": "T"},
            ],
        ),
    ],
)
def test_get_entities(inp, out):
    """Entity span extraction, including entities at the sequence borders."""
    assert get_entities(inp) == out
@pytest.mark.parametrize(
    "annots_true,annots_pred",
    [
        (
            ["O", "B-PERSON", "I-PERSON", "O"],
            ["O", "B-PERSON", "I-PERSON", "O"],
        ),
        (
            ["O", "B-PERSON", "I-PERSON", "B-LOC"],
            ["O", "B-PERSON", "I-PERSON", "O"],
        ),
        (
            ["O", "B-PERSON", "I-PERSON", "O"],
            ["O", "O", "B-PERSON", "O"],
        ),
        (
            ["O", "B-PERSON", "I-PERSON", "O"],
            ["O", "O", "B-PERSON", "O"],
        ),
        (
            ["B-PERSON", "B-LOC", "I-LOC", "B-DATE"],
            ["B-PERSON", "B-DATE", "B-PERSON", "B-DATE"],
        ),
        (
            ["B-PERSON", "I-PERSON", "I-PERSON", "O", "O", "B-LOC", "B-DATE"],
            ["B-PERSON", "I-PERSON", "I-PERSON", "O", "O", "B-LOC", "B-DATE"],
        ),
        (
            ["B-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC"],
            ["B-PERSON", "O", "B-DATE", "O", "B-LOC", "I-LOC", "I-LOC", "I-LOC"],
        ),
        (
            ["B-PERSON", "I-PERSON", "O", "B-LOC", "I-LOC", "O", "B-PERSON", "B-PERSON", "B-LOC"],
            ["B-PERSON", "I-PERSON", "O", "B-LOC", "B-LOC", "O", "B-PERSON", "B-PERSON", "B-LOC"],
        ),
    ]
)
def test_get_report(annots_true, annots_pred):
    """Our report must agree with seqeval's strict IOB2 report.

    The `accuracy` and `micro avg` entries are removed from both reports
    before comparison.
    """
    report = get_report(annots_true, annots_pred)
    seqeval_report = cr([annots_true], [annots_pred], scheme=IOB2, mode="strict", output_dict=True)
    keys_to_delete = {"accuracy", "micro avg"}
    for rep in (report, seqeval_report):
        for key in keys_to_delete:
            try:
                rep.pop(key)
            except KeyError:
                pass
    assert report == seqeval_report
================================================
FILE: github_adventures/ner_evaluation/try.py
================================================
"""Scratch script: poke HuggingFace evaluate's seqeval metric by hand."""
import pprint
import evaluate
metric = evaluate.load("seqeval")
# Tom Cruise is great
annots_true = ["B-PERSON", "I-PERSON", "O", "O"]
# Alternative predictions to experiment with:
# annots_pred = ["B-PERSON", "I-PERSON", "O", "O"]
# annots_pred = ["O", "O", "O", "O"]
# annots_pred = ["B-PERSON", "O", "O", "O"]
# Active case: correct span, wrong entity type.
annots_pred = ["B-LOCATION", "I-LOCATION", "O", "O"]
result = metric.compute(references=[annots_true], predictions=[annots_pred])
pprint.pprint(result)
================================================
FILE: github_adventures/neuron/README.md
================================================
# Installation
```bash
pip install -r requirements.txt
```
# Running training
To run the same experiments as in the video run
```bash
./launch.sh
```
However, feel free to check the contents of the `launch.sh` for single
experiments.
# Evaluation and pretrained models
This repo contains multiple pretrained models inside of `pretrained/`. They
are all `.pkl` files and they were created by pickling `solutions.Solution`
subclasses. To load them inside of Python run something along these lines
```python
import pickle
solution_path = "pretrained/invariant_ours.pkl" # you can change this
with open(solution_path, "rb") as f:
solution = pickle.load(f)[0]
```
You can also run any of the below scripts to reproduce the results from
the end of the video.
```bash
EPISODES=30
python evaluate_shuffling.py -e $EPISODES
python evaluate_noise.py -e $EPISODES
python evaluate_video.py -e $EPISODES
```
================================================
FILE: github_adventures/neuron/evaluate_noise.py
================================================
"""Assumes you have already trained your model and you have a checkpoint."""
import argparse
import pathlib
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tasks import IncompatibleNFeatures, Task
def _load_solution(checkpoint_path):
    """Unpickle a solution checkpoint.

    Parameters
    ----------
    checkpoint_path : pathlib.Path
        Path to a pickle holding either ``(solution,)`` or
        ``(solver, solution)`` (the format written by ``trainer.save``).

    Returns
    -------
    solutions.Solution
        The solution instance. When a solver is present, its favorite
        parameters are loaded into the solution first.

    Raises
    ------
    ValueError
        If the pickle does not contain 1 or 2 objects.
    """
    with checkpoint_path.open("rb") as f:
        obj = pickle.load(f)

    if len(obj) == 1:
        solution_inst = obj[0]
    elif len(obj) == 2:
        solver, solution_inst = obj
        solution_inst.set_params(solver.result.xfavorite)
    else:
        raise ValueError(
            f"Unexpected checkpoint format in {checkpoint_path}: "
            f"expected 1 or 2 pickled objects, got {len(obj)}"
        )

    return solution_inst


def main(argv=None):
    """Evaluate the pretrained invariant model under increasing noise.

    For each number of extra noise features (0, 5, ..., 25) and for both
    shuffled and unshuffled features, runs `--n-episodes` rollouts and
    saves a violin plot of the rewards to ``invariant_model_noise.png``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e",
        "--n-episodes",
        type=int,
        default=200,
    )
    args = parser.parse_args(argv)

    # Prepare solutions and tasks
    checkpoint_path = pathlib.Path("pretrained") / "invariant_official.pkl"
    assert checkpoint_path.exists()
    solution_inst = _load_solution(checkpoint_path)

    results = []
    for n_noise_features in range(0, 30, 5):
        for shuffle in [True, False]:
            print(f"{n_noise_features=}, {shuffle=}")
            task = Task(
                render=False,
                n_noise_features=n_noise_features,
                shuffle_on_reset=shuffle,
                env_seed=None,
                feature_seed=None,
            )
            for episode_ix in range(args.n_episodes):
                reward = task.rollout(solution_inst)
                results.append(
                    {
                        "n_noise_features": n_noise_features,
                        "shuffle": shuffle,
                        "episode_ix": episode_ix,
                        "reward": reward,
                    }
                )

    results_df = pd.DataFrame(results)

    fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300)
    sns.violinplot(
        data=results_df,
        x="n_noise_features",
        y="reward",
        hue="shuffle",
        split=True,
        inner="quart",
        linewidth=1,
        palette="muted",
        ax=ax,
        scale="count",
    )
    sns.despine(left=True)
    ax.set_ylim(0, 1000)
    ax.grid(True)

    fig.tight_layout()
    fig.savefig("invariant_model_noise.png")


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/neuron/evaluate_shuffling.py
================================================
"""Assumes you have already trained your model and you have a checkpoint."""
import argparse
import pathlib
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tasks import IncompatibleNFeatures, Task
def _load_solution(checkpoint_path):
    """Unpickle a solution checkpoint.

    Parameters
    ----------
    checkpoint_path : pathlib.Path
        Path to a pickle holding either ``(solution,)`` or
        ``(solver, solution)`` (the format written by ``trainer.save``).

    Returns
    -------
    solutions.Solution
        The solution instance. When a solver is present, its favorite
        parameters are loaded into the solution first.

    Raises
    ------
    ValueError
        If the pickle does not contain 1 or 2 objects.
    """
    with checkpoint_path.open("rb") as f:
        obj = pickle.load(f)

    if len(obj) == 1:
        solution_inst = obj[0]
    elif len(obj) == 2:
        solver, solution_inst = obj
        solution_inst.set_params(solver.result.xfavorite)
    else:
        raise ValueError(
            f"Unexpected checkpoint format in {checkpoint_path}: "
            f"expected 1 or 2 pickled objects, got {len(obj)}"
        )

    return solution_inst


def main(argv=None):
    """Compare all pretrained models with and without feature shuffling.

    Runs `--n-episodes` rollouts per model/shuffle combination and saves a
    violin plot of the rewards to ``all_models_shuffling.png``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e",
        "--n-episodes",
        type=int,
        default=200,
    )
    args = parser.parse_args(argv)

    # Prepare solutions and tasks
    checkpoints = {}
    checkpoint_folder = pathlib.Path("pretrained")
    assert checkpoint_folder.exists()
    checkpoint_paths = [
        checkpoint_folder / "linear.pkl",
        checkpoint_folder / "linear_augment.pkl",
        checkpoint_folder / "MLP.pkl",
        checkpoint_folder / "MLP_augment.pkl",
        checkpoint_folder / "invariant_ours.pkl",
        checkpoint_folder / "invariant_official.pkl",
    ]
    for path in checkpoint_paths:
        checkpoints[path.stem] = _load_solution(path)

    results = []
    for model_name, solution_inst in checkpoints.items():
        for shuffle in [True, False]:
            print(f"{model_name=}, {shuffle=}")
            task = Task(
                render=False,
                n_noise_features=0,
                shuffle_on_reset=shuffle,
                env_seed=None,
                feature_seed=None,
            )
            for episode_ix in range(args.n_episodes):
                reward = task.rollout(solution_inst)
                results.append(
                    {
                        "model": model_name,
                        "shuffle": shuffle,
                        "episode_ix": episode_ix,
                        "reward": reward,
                    }
                )

    results_df = pd.DataFrame(results)

    fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300)
    sns.violinplot(
        data=results_df,
        x="model",
        y="reward",
        hue="shuffle",
        split=True,
        inner="quart",
        linewidth=1,
        palette="muted",
        ax=ax,
        scale="count",
        order=sorted(checkpoints.keys()),
    )
    sns.despine(left=True)
    ax.set_ylim(0, 1000)
    ax.grid(True)

    fig.tight_layout()
    fig.savefig("all_models_shuffling.png")


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/neuron/evaluate_video.py
================================================
"""Assumes you have already trained your model and you have a checkpoint."""
import argparse
import pathlib
import pickle
from gym.wrappers import Monitor
import matplotlib.pyplot as plt
from tasks import IncompatibleNFeatures, Task
def _load_solution(checkpoint_path):
    """Unpickle a solution checkpoint.

    Parameters
    ----------
    checkpoint_path : pathlib.Path
        Path to a pickle holding either ``(solution,)`` or
        ``(solver, solution)`` (the format written by ``trainer.save``).

    Returns
    -------
    solutions.Solution
        The solution instance. When a solver is present, its favorite
        parameters are loaded into the solution first.

    Raises
    ------
    ValueError
        If the pickle does not contain 1 or 2 objects.
    """
    with checkpoint_path.open("rb") as f:
        obj = pickle.load(f)

    if len(obj) == 1:
        solution_inst = obj[0]
    elif len(obj) == 2:
        solver, solution_inst = obj
        solution_inst.set_params(solver.result.xfavorite)
    else:
        raise ValueError(
            f"Unexpected checkpoint format in {checkpoint_path}: "
            f"expected 1 or 2 pickled objects, got {len(obj)}"
        )

    return solution_inst


def main(argv=None):
    """Record rollout videos for every pretrained model.

    For each checkpoint, shuffle setting and episode, wraps the task's env
    in a gym `Monitor` that writes a video under ``videos/``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-e",
        "--n-episodes",
        type=int,
        default=2,
    )
    args = parser.parse_args(argv)

    # Prepare solutions and tasks
    checkpoints = {}
    checkpoint_folder = pathlib.Path("pretrained")
    assert checkpoint_folder.exists()
    checkpoint_paths = [
        checkpoint_folder / "linear.pkl",
        checkpoint_folder / "linear_augment.pkl",
        checkpoint_folder / "MLP.pkl",
        checkpoint_folder / "MLP_augment.pkl",
        checkpoint_folder / "invariant_ours.pkl",
        checkpoint_folder / "invariant_official.pkl",
    ]
    for path in checkpoint_paths:
        checkpoints[path.stem] = _load_solution(path)

    for model_name, solution_inst in checkpoints.items():
        for shuffle in [True, False]:
            for episode_ix in range(args.n_episodes):
                print(f"{model_name=}, {shuffle=}")
                task = Task(
                    render=False,
                    n_noise_features=0,
                    shuffle_on_reset=shuffle,
                    env_seed=None,
                    feature_seed=None,
                )
                # Each episode is recorded into its own subfolder.
                task.env = Monitor(
                    task.env,
                    f"videos/{model_name}/{shuffle}/{episode_ix}/",
                )
                task.rollout(solution_inst)


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/neuron/launch.sh
================================================
# Train all model variants shown in the video; logs/checkpoints go under $OUTPUT_FOLDER.
OUTPUT_FOLDER=log_dir

# Linear policy, without and with the feature-shuffling augmentation.
python trainer.py --max-iter 1000 linear $OUTPUT_FOLDER/linear
python trainer.py --max-iter 1000 --shuffle-on-reset linear $OUTPUT_FOLDER/linear_augment

# MLP policy, without and with the feature-shuffling augmentation.
python trainer.py --max-iter 1000 MLP $OUTPUT_FOLDER/MLP
python trainer.py --max-iter 2000 --shuffle-on-reset MLP $OUTPUT_FOLDER/MLP_augment

# Permutation-invariant policy (trained here without the shuffling flag).
python trainer.py --max-iter 14000 invariant $OUTPUT_FOLDER/invariant
================================================
FILE: github_adventures/neuron/requirements.txt
================================================
cma
gym
gym-cartpole-swingup
matplotlib
numpy
pandas
seaborn
tensorboard
torch
tqdm
================================================
FILE: github_adventures/neuron/solutions.py
================================================
import abc
import numpy as np
import torch
from torch_utils import PermutationInvariantNetwork, MLP
class Solution(abc.ABC):
    """Solution abstract class.

    Attributes
    ----------
    policy : torch.nn.Module
        Network that holds all the learnable parameters.
    """

    @abc.abstractmethod
    def clone(self):
        # NOTE: the abstract signature used to take an unused `obs` argument;
        # all implementations and callers use `clone()` with no arguments.
        """Create a copy of the current solution without any links to self."""

    @abc.abstractmethod
    def get_action(self, obs):
        """Determine the next action given the observation array."""

    @abc.abstractmethod
    def get_n_features(self):
        """Get the number of features expected by the model.

        If None then the model can process variable-sized feature
        vectors.
        """

    @abc.abstractmethod
    def reset(self):
        """Reset solution.

        Will be called at the beginning of each rollout.
        Does not mean we will "reinitialize" the weights of `policy`.
        """

    def get_params(self):
        """Get learnable parameters of the solution.

        Returns
        -------
        params : np.ndarray
            1D array containing all parameters.
        """
        params_l = []
        for p in self.policy.parameters():
            # `detach` makes this safe even for parameters tracking gradients.
            params_l.append(p.detach().numpy().ravel())

        params = np.concatenate(params_l)

        return params

    def set_params(self, params):
        """Set the learnable parameters.

        Parameters
        ----------
        params : np.ndarray
            1D array containing all parameters.

        Returns
        -------
        self : Solution
        """
        start_ix, end_ix = 0, 0
        for p in self.policy.parameters():
            end_ix = start_ix + np.prod(p.shape)
            # Replace the parameter data in place, preserving the shape.
            p.data = torch.from_numpy(
                params[start_ix:end_ix].reshape(p.shape)
            ).float()
            start_ix = end_ix

        return self

    def get_n_params(self):
        """Get the total number of learnable parameters."""
        return len(self.get_params())
class MLPSolution(Solution):
    """Multilayer perceptron solution.

    Parameters
    ----------
    n_features : int
        Number of input features.

    hidden_layer_sizes : tuple
        Tuple of int that defines the sizes of all hidden layers.

    Attributes
    ----------
    kwargs : dict
        All parameters necessary to instantiate the class.

    policy : MLP
        Policy network - multilayer perceptron.
    """

    def __init__(self, n_features=5, hidden_layer_sizes=(16,)):
        self.kwargs = {
            "n_features": n_features,
            "hidden_layer_sizes": hidden_layer_sizes,
        }
        self.dtype = torch.float32

        self.policy = MLP(n_features, hidden_layer_sizes)
        self.policy.to(self.dtype)
        self.policy.eval()

    def clone(self):
        """Create an independent copy with identical weights."""
        duplicate = self.__class__(**self.kwargs)
        duplicate.policy.load_state_dict(self.policy.state_dict())

        return duplicate

    def get_action(self, obs):
        """Map an observation array to a scalar action."""
        features = torch.from_numpy(obs).to(self.dtype)

        return self.policy(features).item()

    def get_n_features(self):
        """Return the fixed input dimensionality of the policy."""
        return self.kwargs["n_features"]

    def reset(self):
        """Nothing to reset - the policy is stateless across steps."""
class PermutationInvariantSolution(Solution):
    """Permutation invariant solution.

    Parameters
    ----------
    n_embeddings : int
        Number of rows in the Q tensor.

    proj_dim : int
        Size of the space to which we project the K and Q tensors.

    hidden_size : int
        Dimensionality of the Q and K tensors before linear projections.

    Attributes
    ----------
    kwargs : dict
        All parameters necessary to instantiate the class

    dtype : torch.dtype
        Dtype of both the network weights and input features.

    policy : PermutationInvariantNetwork
        Policy network.

    prev_action : float
        Stores the previous action. Automatically updated each time we call
        `get_action`.
    """

    def __init__(
        self,
        n_embeddings=16,
        proj_dim=32,
        hidden_size=8,
    ):
        self.kwargs = {
            "n_embeddings": n_embeddings,
            "proj_dim": proj_dim,
            "hidden_size": hidden_size,
        }
        self.policy = PermutationInvariantNetwork(
            n_embeddings=n_embeddings,
            proj_dim=proj_dim,
            hidden_size=hidden_size,
        )
        self.dtype = torch.float32
        self.policy.to(self.dtype)
        self.policy.eval()

        self.prev_action = 0  # will be continuously updated

    def clone(self):
        """Create an independent copy with identical weights."""
        old_policy = self.policy
        new_solution = self.__class__(**self.kwargs)
        new_solution.policy.load_state_dict(
            old_policy.state_dict(),
        )
        return new_solution

    def get_action(self, obs):
        """Determine the next action and remember it for the next step."""
        y = self.policy(torch.from_numpy(obs).to(self.dtype), self.prev_action)
        action = y.item()
        self.prev_action = action

        return action

    def reset(self):
        """Clear the LSTM hidden state and the remembered previous action."""
        self.policy.attention_neuron.hx = None
        # BUGFIX: used to assign `self.previous_action`, which left the actual
        # `prev_action` attribute untouched across rollouts.
        self.prev_action = 0

    def get_n_features(self):
        """Return None - the network handles variable-sized inputs."""
        return None
================================================
FILE: github_adventures/neuron/tasks.py
================================================
import gym
import gym_cartpole_swingup # noqa has a sideffect
import numpy as np
# Size of the raw observation vector returned by the environment.
N_ORIGINAL_FEATURES = 5


class IncompatibleNFeatures(Exception):
    """Raised when observation and model number of features does not match."""
class Task:
    """Cartpole swingup task.

    Parameters
    ----------
    render : bool
        If True, we render each step into a video frame.

    shuffle_on_reset : bool
        If True, the features are randomly shuffled before each rollout.

    n_noise_features : int
        Number of noise features added to the observation vector.

    env_seed : None or int
        Random state controlling the underlying `gym.Env`.

    feature_seed : None or int
        Random state controlling the shuffling and noise features.

    max_episode_steps : int
        Maximum number of steps per episode (=rollout). After this number
        `done=True` automatically.

    Attributes
    ----------
    n_features : int
        Overall number of features (original + noise).

    perm_ix : np.ndarray
        1D array storing the permutation indices of the features.

    env : gym.Env
        Environment.

    rnd : RandomState
        Random state.
    """

    def __init__(
        self,
        render=False,
        shuffle_on_reset=False,
        n_noise_features=0,
        env_seed=None,
        feature_seed=None,
        max_episode_steps=1000,
    ):
        self.env = gym.make("CartPoleSwingUp-v1")
        self.env._max_episode_steps = max_episode_steps

        self.shuffle_on_reset = shuffle_on_reset
        self.render = render
        self.n_noise_features = n_noise_features
        self.n_features = N_ORIGINAL_FEATURES + n_noise_features
        # Identity permutation until `reset_for_rollout` (possibly) shuffles it.
        self.perm_ix = np.arange(self.n_features)
        # Standard deviation of the per-step gaussian noise features.
        self.noise_std = 0.1

        # Set seeds
        self.env.seed(env_seed)
        self.rnd = np.random.RandomState(seed=feature_seed)

    def reset_for_rollout(self):
        """Generate a new permutation of the features.

        It is going to be called at the beginning of each episode.
        Note that the permutation stays constant throughout the episode.
        """
        self.perm_ix = np.arange(self.n_features)
        if self.shuffle_on_reset:
            self.rnd.shuffle(self.perm_ix)

    def modify_obs(self, obs):
        """Modify raw observations.

        Parameters
        ----------
        obs : np.ndarray
            Raw observation/feature array of shape `(5,)`.

        Returns
        -------
        obs_modified : np.ndarray
            Modified observation array of shape `(5 + n_noise_features,)`.
            If `shuffle_on_reset` then the order of the features is going
            to change.
        """
        # Noise is redrawn every step; only the permutation is fixed per episode.
        noise = self.rnd.randn(self.n_noise_features) * self.noise_std
        obs_and_noise = np.concatenate([obs, noise], axis=0)
        obs_modified = obs_and_noise[self.perm_ix]

        return obs_modified

    def rollout(self, solution):
        """Run a single episode/rollout.

        Parameters
        ----------
        solution : solutions.Solution
            Instance of a solution that yields an action given an
            observation.

        Returns
        -------
        ep_reward : int
            Overall episode reward computed as a sum of per step rewards.
        """
        # sanity check
        n_features_solution = solution.get_n_features()
        n_features_task = self.n_features
        if (
            n_features_solution is not None
            and n_features_solution != n_features_task
        ):
            raise IncompatibleNFeatures

        self.reset_for_rollout()
        solution.reset()  # important for PermutationInvariantSolution

        obs = self.env.reset()
        if self.render:
            self.env.render()

        ep_reward = 0
        done = False

        while not done:
            obs_modified = self.modify_obs(obs)
            action = solution.get_action(obs_modified)
            obs, reward, done, _ = self.env.step(action)
            ep_reward += reward

            if self.render:
                self.env.render()

        return ep_reward
================================================
FILE: github_adventures/neuron/torch_utils.py
================================================
import numpy as np
import torch
import torch.nn as nn
class MLP(nn.Module):
    """Multilayer perceptron policy network.

    Parameters
    ----------
    n_features : int
        Number of input features.

    hidden_layer_sizes : tuple
        Tuple of int that defines the sizes of all hidden layers.

    Attributes
    ----------
    net : nn.Sequential
        The actual network.
    """

    def __init__(self, n_features, hidden_layer_sizes):
        super().__init__()
        sizes = (n_features,) + hidden_layer_sizes + (1,)

        # Every linear layer (including the last one) is followed by a
        # tanh, which keeps the final scalar output in (-1, 1).
        modules = []
        for n_in, n_out in zip(sizes[:-1], sizes[1:]):
            modules.append(nn.Linear(n_in, n_out))
            modules.append(nn.Tanh())

        self.net = nn.Sequential(*modules)

        # Weights are evolved, not trained with backprop.
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, obs):
        """Run forward pass.

        Parameters
        ----------
        obs : torch.Tensor
            1D tensor representing the input observation of shape
            `(n_features,)`.

        Returns
        -------
        torch.Tensor
            Scalar between -1 and 1 representing the action.
        """
        batched = obs[None, :]  # add a batch dimension of 1

        return self.net(batched)[0]
def pos_table(n_embeddings, hidden_size):
    """Create a table of positional encodings.

    Parameters
    ----------
    n_embeddings : int
        Number of rows of the table.

    hidden_size : int
        Number of columns of the table.

    Returns
    -------
    tab : np.ndarray
        2D array holding the positional encodings.
    """
    positions = np.arange(n_embeddings)[:, None]  # (n_embeddings, 1)
    dims = np.arange(hidden_size)[None, :]  # (1, hidden_size)

    # angle(x, h) = x / 10000 ** (2 * (h // 2) / hidden_size)
    tab = positions / np.power(10000, 2 * (dims // 2) / hidden_size)
    tab = tab.astype(float)

    # sine on even columns, cosine on odd columns
    tab[:, 0::2] = np.sin(tab[:, 0::2])
    tab[:, 1::2] = np.cos(tab[:, 1::2])

    return tab
class AttentionMatrix(nn.Module):
    """Generates attention matrix using the key and query tensors.

    Parameters
    ----------
    proj_dim : int
        Size of the space to which we project the K and Q tensors.

    hidden_size : int
        Dimensionality of the Q and K tensors before linear projections.

    scale : bool
        If True, then the attention matrix will be divided by
        `proj_dim ** (1 / 2)` elementwise.

    Attributes
    ----------
    proj_q, proj_k : torch.nn.Linear
        Linear models projecting the Q and K tensors.

    scalar : float
        Number used for scaling the attention matrix elementwise.
    """

    def __init__(self, hidden_size, proj_dim, scale=True):
        super().__init__()
        # Bias-free projections, created in this order so the parameter
        # layout stays (proj_q weights, proj_k weights).
        self.proj_q = nn.Linear(hidden_size, proj_dim, bias=False)
        self.proj_k = nn.Linear(hidden_size, proj_dim, bias=False)

        self.scalar = np.sqrt(proj_dim) if scale else 1

    def forward(self, data_q, data_k):
        """Run the forward pass.

        Parameters
        ----------
        data_q : torch.Tensor
            Query tensor of shape `(n_embeddings, hidden_size)`.

        data_k : torch.Tensor
            Key tensor of shape `(n_features, hidden_size)`.

        Returns
        -------
        attention_weights : torch.Tensor
            Attention weights (don't sum up to 1 in general) of shape
            `(n_embeddings, n_features)`.
        """
        q = self.proj_q(data_q)  # (n_embeddings, proj_dim)
        k = self.proj_k(data_k)  # (n_features, proj_dim)

        scores = (q @ k.T) / self.scalar  # (n_embeddings, n_features)

        # tanh instead of the usual softmax - weights need not sum to 1.
        return torch.tanh(scores)
class AttentionNeuron(nn.Module):
    """Permutation invariant layer.

    Parameters
    ----------
    n_embeddings : int
        Number of rows in the Q tensor. In our case it is equal to the length
        of the latent code `m`.

    proj_dim : int
        Size of the space to which we project the K and Q tensors.

    hidden_size : int
        The dimensionality of the Q and K tensors before linear projections.

    Attributes
    ----------
    hx : tuple or None
        If not None then a tuple of 2 hidden state tensors (LSTM specific)

    lstm : nn.LSTMCell
        LSTM cell that inputs a hidden state and an observation and
        outputs a new hidden state.

    attention_matrix : AttentionMatrix
        Attention matrix (only needs Q and K tensors).

    Q : torch.Tensor
        Query tensor that is not learnable since it is populated with
        positional encodings.
    """

    def __init__(
        self,
        n_embeddings=16,
        proj_dim=32,
        hidden_size=8,
    ):
        super().__init__()
        self.n_embeddings = n_embeddings
        self.proj_dim = proj_dim
        self.hidden_size = hidden_size

        # Modules
        # `hx` persists across `forward` calls; callers are responsible for
        # clearing it between rollouts (see PermutationInvariantSolution.reset).
        self.hx = None
        # input_size=2: each per-feature token is (feature value, prev action).
        self.lstm = nn.LSTMCell(input_size=2, hidden_size=hidden_size)
        self.attention_matrix = AttentionMatrix(
            hidden_size=hidden_size,
            proj_dim=proj_dim,
            scale=False,
        )
        # Fixed, non-learnable query table of positional encodings.
        self.register_buffer(
            "Q",
            torch.from_numpy(
                pos_table(
                    n_embeddings,
                    hidden_size,
                )
            ).float(),
        )

    def forward(self, obs, prev_action):
        """Run forward pass.

        Parameters
        ----------
        obs : torch.Tensor
            1D tensor representing the input observations of shape
            `(n_features,)`.

        prev_action : float
            Number between -1 and 1 based on what the previous action was.

        Returns
        -------
        latent_code : torch.Tensor
            1D tensor representing the latent code of shape `(n_embeddings,)`.

        attn_weights : torch.Tensor
            2D tensor of shape `(n_embeddings, n_features)` representing
            attention weights.
        """
        n_features = len(obs)
        prev_action = float(prev_action)

        # Pair every feature with the previous action to form LSTM inputs.
        obs_and_act = torch.cat(
            [
                obs[:, None],
                torch.ones(n_features, 1) * prev_action,
            ],
            dim=-1,
        )  # (n_features, 2)

        # Lazily initialize the hidden state; its leading dimension is
        # `n_features`, which is what lets this layer accept variable-sized
        # observation vectors.
        if self.hx is None:
            self.hx = (
                torch.zeros(n_features, self.hidden_size),
                torch.zeros(n_features, self.hidden_size),
            )

        self.hx = self.lstm(
            obs_and_act, self.hx
        )  # Tuple[(n_features, hidden_size)]

        data_q = self.Q  # (n_embeddings, hidden_size)
        data_k = self.hx[0]  # (n_features, hidden_size)
        data_v = obs[:, None]  # (n_features, 1)

        attn_weights = self.attention_matrix(
            data_q=data_q, data_k=data_k
        )  # (n_embeddings, n_features)

        latent_code_ = torch.tanh(attn_weights @ data_v)  # (n_embeddings, 1)
        latent_code = latent_code_.squeeze()  # (n_embeddings,)

        return latent_code, attn_weights
class PermutationInvariantNetwork(nn.Module):
    """Permutation invariant policy network.

    Parameters
    ----------
    n_embeddings : int
        Number of rows in the Q tensor.

    proj_dim : int
        Size of the space to which we project the K and Q tensors.

    hidden_size : int
        Dimensionality of the Q and K matrices before linear projections.

    Attributes
    ----------
    attention_neuron : AttentionNeuron
        Permutation invariant layer that generates latent codes.

    linear : nn.Linear
        Maps the latent code into a single number.
    """

    def __init__(
        self,
        n_embeddings=16,
        proj_dim=32,
        hidden_size=8,
    ):
        super().__init__()
        self.attention_neuron = AttentionNeuron(
            n_embeddings=n_embeddings,
            proj_dim=proj_dim,
            hidden_size=hidden_size,
        )
        self.linear = nn.Linear(n_embeddings, 1)

        # Parameters are set externally (evolution), never via backprop.
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, obs, prev_action):
        """Run forward pass.

        Parameters
        ----------
        obs : torch.Tensor
            1D tensor representing the input observations of shape
            `(n_features,)`.

        prev_action : float
            Number between -1 and 1 based on what the previous action was.

        Returns
        -------
        y : torch.Tensor
            Scalar tensor with a value in range (-1, 1) representing the
            next action.
        """
        latent_code, _ = self.attention_neuron(
            obs, prev_action
        )  # (n_embeddings,)

        squashed = torch.tanh(self.linear(latent_code[None, :]))  # (1, 1)

        return squashed[0]  # (1,)
================================================
FILE: github_adventures/neuron/trainer.py
================================================
import argparse
import json
import multiprocessing as mp
import pathlib
import pickle
from functools import partial
import cma
import numpy as np
import tqdm
from torch.utils.tensorboard import SummaryWriter
from solutions import (
MLPSolution,
PermutationInvariantSolution,
)
from tasks import Task, N_ORIGINAL_FEATURES
def save(folder, n_iter, solver, solution_inst):
    """Save checkpoint.

    Parameters
    ----------
    folder : str
        Output folder.

    n_iter : int
        Iteration that corresponds to the checkpoint.

    solver : cma.CMAEvolutionStrategy
        Solver instance.

    solution_inst : Solution
        Solution instance.
    """
    out_dir = pathlib.Path(folder)
    out_dir.mkdir(parents=True, exist_ok=True)

    # One pickle per iteration: "<n_iter>.pkl" holding (solver, solution).
    checkpoint = out_dir / f"{n_iter}.pkl"
    with checkpoint.open("wb") as f:
        pickle.dump((solver, solution_inst), f)
def get_fitness(
    solution_inst,
    *,
    shuffle_on_reset,
    n_episodes,
    n_noise_features,
    env_seed,
    feature_seed,
):
    """Get fitness function used by the CMA optimizer/solver.

    Can be run independently on a single worker.

    Returns
    -------
    fitness : list
        List of floats of length `n_episodes` holding the per episode reward.
    """
    # Fresh task per call so each worker owns its own environment.
    task = Task(
        render=False,
        shuffle_on_reset=shuffle_on_reset,
        n_noise_features=n_noise_features,
        env_seed=env_seed,
        feature_seed=feature_seed,
    )

    return [task.rollout(solution_inst) for _ in range(n_episodes)]
def main(argv=None):
    """CLI entry point: train a policy with CMA-ES and log to tensorboard."""
    parser = argparse.ArgumentParser(
        "Training",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "solution",
        type=str,
        choices=(
            "linear",
            "MLP",
            "invariant",
        ),
    )
    parser.add_argument(
        "log_dir",
        type=str,
        help="Logging folder",
    )
    parser.add_argument(
        "--checkpoint",
        type=str,
        help="Pickled solver and solution",
    )
    parser.add_argument(
        "--env-seed",
        type=int,
    )
    parser.add_argument(
        "--eval-frequency",
        type=int,
        default=25,
    )
    parser.add_argument(
        "--feature-seed",
        type=int,
    )
    parser.add_argument(
        "-m",
        "--max-iter",
        type=int,
        default=10000,
        help="Maximum number of iterations",
    )
    parser.add_argument(
        "-e",
        "--n-episodes",
        type=int,
        default=16,
        help="Number of rollouts for fitness evaluation",
    )
    parser.add_argument(
        "-j",
        "--n-jobs",
        type=int,
        default=-1,
        help="Number of processes",
    )
    parser.add_argument(
        "-n",
        "--n-noise-features",
        type=int,
        default=0,
        help="Number of noise features",
    )
    parser.add_argument(
        "-p",
        "--population-size",
        type=int,
        default=256,
        help="Number of solutions per generation",
    )
    parser.add_argument(
        "-s",
        "--shuffle-on-reset",
        action="store_true",
        help="Shuffle features before each rollout",
    )
    args = parser.parse_args(argv)

    writer = SummaryWriter(args.log_dir)
    writer.add_text("parameters", json.dumps(vars(args)))

    # Solution map
    if args.solution == "linear":
        # A linear policy is just an MLP with no hidden layers.
        solution_inst = MLPSolution(
            n_features=N_ORIGINAL_FEATURES + args.n_noise_features,
            hidden_layer_sizes=tuple(),
        )
    elif args.solution == "MLP":
        solution_inst = MLPSolution(
            n_features=N_ORIGINAL_FEATURES + args.n_noise_features,
            hidden_layer_sizes=(16,),
        )
    elif args.solution == "invariant":
        solution_inst = PermutationInvariantSolution(
            n_embeddings=16,
            proj_dim=32,
            hidden_size=8,
        )
    else:
        raise ValueError

    # Prepare solver
    if args.checkpoint is None:
        # Fresh run: start the search from the all-zeros parameter vector.
        x0 = np.zeros(solution_inst.get_n_params())
        solver = cma.CMAEvolutionStrategy(
            x0=x0,
            sigma0=0.1,
            inopts={
                "popsize": args.population_size,
                "seed": 42,
                "randn": np.random.randn,
            },
        )
    else:
        # Resume: the checkpoint pickle stores (solver, solution).
        with open(args.checkpoint, "rb") as f:
            solver, solution_inst_ = pickle.load(f)

        assert isinstance(solution_inst, solution_inst_.__class__)
        solution_inst = solution_inst_

    # Bind all task-related options so workers only receive a solution.
    get_fitness_partial = partial(
        get_fitness,
        n_episodes=args.n_episodes,
        shuffle_on_reset=args.shuffle_on_reset,
        n_noise_features=args.n_noise_features,
        env_seed=args.env_seed,
        feature_seed=args.feature_seed,
    )

    if args.n_jobs == -1:
        n_jobs = mp.cpu_count()
    else:
        n_jobs = args.n_jobs

    with mp.Pool(processes=n_jobs) as pool:
        for n_iter in tqdm.tqdm(range(args.max_iter)):
            try:
                # Sample a generation of candidate parameter vectors.
                params_set = solver.ask()
                iterable = [
                    solution_inst.clone().set_params(p) for p in params_set
                ]
                # Evaluate each candidate's rollouts in a worker process.
                rewards = pool.map(get_fitness_partial, iterable)
                pos_fitnesses = [np.mean(r) for r in rewards]
                # The solver minimizes, so negate the rewards.
                neg_fitnesses = [-x for x in pos_fitnesses]

                all_parameters = np.concatenate(params_set)
                metrics = {
                    "parameter_mean": all_parameters.mean(),
                    "parameter_std": all_parameters.std(),
                    "mean": np.mean(pos_fitnesses),
                    "max (generation)": np.max(pos_fitnesses),
                    "max (overall)": -solver.result.fbest,
                }
                for metric_name, metric in metrics.items():
                    writer.add_scalar(metric_name, metric, global_step=n_iter)

                if (n_iter % args.eval_frequency == 0) or (
                    n_iter == (args.max_iter - 1)
                ):
                    save(args.log_dir, n_iter, solver, solution_inst)

                solver.tell(params_set, neg_fitnesses)

            except KeyboardInterrupt:
                # Persist progress before exiting on Ctrl+C.
                save(
                    args.log_dir,
                    n_iter,
                    solver,
                    solution_inst,
                )
                break


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/pondernet/experiment_1.sh
================================================
# Print each command before executing it.
set -x

# Random seed shared by every run of this sweep (used in the output path).
SEED=$RANDOM

# Values of --lambda-p (geometric-distribution success probability) to sweep.
LAMBDAS=(0.1 0.3 0.5 0.7 0.9)

for lambda in ${LAMBDAS[@]}
do
    python train.py \
        --batch-size 128 \
        --beta 0.01 \
        --device cuda \
        --eval-frequency 4000 \
        --n-iter 100000 \
        --n-hidden 128 \
        --lambda-p $lambda \
        --n-elems 15 \
        results/experiment_a/$SEED/lambda_$lambda
done
================================================
FILE: github_adventures/pondernet/experiment_2.sh
================================================
# Print each command before executing it.
set -x

# Random seed used only to name this run's output folder.
SEED=$RANDOM

# Single long run with explicit bounds on nonzero training elements (1-25).
python train.py \
    --batch-size 128 \
    --beta 0.01 \
    --eval-frequency 4000 \
    --device cuda \
    --lambda-p 0.2 \
    --n-elems 30 \
    --n-iter 1500000 \
    --n-hidden 128 \
    --n-nonzero 1 25 \
    results/experiment_b/$SEED
================================================
FILE: github_adventures/pondernet/requirements.txt
================================================
matplotlib
numpy
tensorboard
torch
tqdm
================================================
FILE: github_adventures/pondernet/train.py
================================================
from argparse import ArgumentParser
import json
import pathlib
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from utils import (
ParityDataset,
PonderNet,
ReconstructionLoss,
RegularizationLoss,
)
@torch.no_grad()
def evaluate(dataloader, module):
    """Compute relevant metrics.

    Parameters
    ----------
    dataloader : DataLoader
        Dataloader that yields batches of `x` and `y`.

    module : PonderNet
        Our pondering network.

    Returns
    -------
    metrics_single : dict
        Scalar metrics. The keys are names and the values are `torch.Tensor`.
        These metrics are computed as mean values over the entire dataset.

    metrics_per_step : dict
        Per step metrics. The keys are names and the values are `torch.Tensor`
        of shape `(max_steps,)`. These metrics are computed as mean values over
        the entire dataset.
    """
    # Infer device and dtype from the module's parameters.
    param = next(module.parameters())
    device, dtype = param.device, param.dtype

    # Per-batch accumulators, averaged over batches at the end.
    batches_single = {"accuracy_halted": [], "halting_step": []}
    batches_per_step = {"accuracy": [], "p": []}

    for x, y_true in dataloader:
        x = x.to(device, dtype)  # (batch_size, n_elems)
        y_true = y_true.to(device, dtype)  # (batch_size,)

        y_pred, p, halting_step = module(x)

        # Pick, for every sample, the prediction at its halting step.
        gather_ix = halting_step[None, :] - 1
        y_halted = y_pred.gather(dim=0, index=gather_ix)[0]  # (batch_size,)

        # Single metrics (mean over samples in the batch)
        correct_halted = (y_halted > 0) == y_true
        batches_single["accuracy_halted"].append(
            correct_halted.to(torch.float32).mean()
        )
        batches_single["halting_step"].append(
            halting_step.to(torch.float).mean()
        )

        # Per step metrics (mean over samples in the batch)
        correct_per_step = (y_pred > 0) == y_true[None, :]
        batches_per_step["accuracy"].append(
            correct_per_step.to(torch.float32).mean(dim=1)
        )
        batches_per_step["p"].append(p.mean(dim=1))

    metrics_single = {
        name: torch.stack(vals).mean(dim=0).cpu().numpy()
        for name, vals in batches_single.items()
    }
    metrics_per_step = {
        name: torch.stack(vals).mean(dim=0).cpu().numpy()
        for name, vals in batches_per_step.items()
    }

    return metrics_single, metrics_per_step
def plot_distributions(target, predicted):
    """Create a barplot.

    Parameters
    ----------
    target, predicted : np.ndarray
        Arrays of shape `(max_steps,)` representing the target and predicted
        probability distributions.

    Returns
    -------
    matplotlib.Figure
    """
    steps = list(range(1, len(target) + 1))
    fig, ax = plt.subplots(dpi=140)

    # Wide red bars: target distribution; narrow green bars on top: predicted.
    ax.bar(
        steps,
        target,
        color="red",
        label=f"Target - Geometric({target[0].item():.2f})",
    )
    ax.bar(
        steps,
        predicted,
        color="green",
        width=0.4,
        label="Predicted",
    )

    ax.set_ylim(0, 0.6)
    ax.set_xticks(steps)
    ax.legend()
    ax.grid()

    return fig
def plot_accuracy(accuracy):
    """Create a barplot representing accuracy over different halting steps.

    Parameters
    ----------
    accuracy : np.array
        1D array representing accuracy if we were to take the output after
        the corresponding step.

    Returns
    -------
    matplotlib.Figure
    """
    steps = list(range(1, len(accuracy) + 1))
    fig, ax = plt.subplots(dpi=140)

    ax.bar(
        steps,
        accuracy,
        label="Accuracy over different steps",
    )

    ax.set_ylim(0, 1)
    ax.set_xticks(steps)
    ax.legend()
    ax.grid()

    return fig
def main(argv=None):
    """CLI for training."""
    parser = ArgumentParser()
    parser.add_argument(
        "log_folder",
        type=str,
        help="Folder where tensorboard logging is saved",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=128,
        help="Batch size",
    )
    parser.add_argument(
        "--beta",
        type=float,
        default=0.01,
        help="Regularization loss coefficient",
    )
    parser.add_argument(
        "-d",
        "--device",
        type=str,
        choices={"cpu", "cuda"},
        default="cpu",
        help="Device to use",
    )
    parser.add_argument(
        "--eval-frequency",
        type=int,
        default=10_000,
        help="Evaluation is run every `eval_frequency` steps",
    )
    parser.add_argument(
        "--lambda-p",
        type=float,
        default=0.4,
        help="True probability of success for a geometric distribution",
    )
    parser.add_argument(
        "--n-iter",
        type=int,
        default=1_000_000,
        help="Number of gradient steps",
    )
    parser.add_argument(
        "--n-elems",
        type=int,
        default=64,
        help="Number of elements",
    )
    parser.add_argument(
        "--n-hidden",
        type=int,
        default=64,
        help="Number of hidden elements in the reccurent cell",
    )
    parser.add_argument(
        "--n-nonzero",
        type=int,
        nargs=2,
        default=(None, None),
        help="Lower and upper bound on nonzero elements in the training set",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=20,
        help="Maximum number of pondering steps",
    )

    # Parameters
    args = parser.parse_args(argv)
    print(args)

    device = torch.device(args.device)
    dtype = torch.float32
    # Size of each evaluation dataset and its dataloader batch size.
    n_eval_samples = 1000
    batch_size_eval = 50

    # Derive the "easy" and "hard" nonzero-element ranges for evaluation.
    if args.n_nonzero[0] is None and args.n_nonzero[1] is None:
        threshold = int(0.3 * args.n_elems)
        range_nonzero_easy = (1, threshold)
        range_nonzero_hard = (args.n_elems - threshold, args.n_elems)
    else:
        range_nonzero_easy = (1, args.n_nonzero[1])
        range_nonzero_hard = (args.n_nonzero[1] + 1, args.n_elems)

    # Tensorboard
    log_folder = pathlib.Path(args.log_folder)
    writer = SummaryWriter(log_folder)
    writer.add_text("parameters", json.dumps(vars(args)))

    # Prepare data
    dataloader_train = DataLoader(
        ParityDataset(
            n_samples=args.batch_size * args.n_iter,
            n_elems=args.n_elems,
            n_nonzero_min=args.n_nonzero[0],
            n_nonzero_max=args.n_nonzero[1],
        ),
        batch_size=args.batch_size,
    )  # consider specifying `num_workers` for speedups
    eval_dataloaders = {
        "test": DataLoader(
            ParityDataset(
                n_samples=n_eval_samples,
                n_elems=args.n_elems,
                n_nonzero_min=args.n_nonzero[0],
                n_nonzero_max=args.n_nonzero[1],
            ),
            batch_size=batch_size_eval,
        ),
        f"{range_nonzero_easy[0]}_{range_nonzero_easy[1]}": DataLoader(
            ParityDataset(
                n_samples=n_eval_samples,
                n_elems=args.n_elems,
                n_nonzero_min=range_nonzero_easy[0],
                n_nonzero_max=range_nonzero_easy[1],
            ),
            batch_size=batch_size_eval,
        ),
        f"{range_nonzero_hard[0]}_{range_nonzero_hard[1]}": DataLoader(
            ParityDataset(
                n_samples=n_eval_samples,
                n_elems=args.n_elems,
                n_nonzero_min=range_nonzero_hard[0],
                n_nonzero_max=range_nonzero_hard[1],
            ),
            batch_size=batch_size_eval,
        ),
    }

    # Model preparation
    module = PonderNet(
        n_elems=args.n_elems,
        n_hidden=args.n_hidden,
        max_steps=args.max_steps,
    )
    module = module.to(device, dtype)

    # Loss preparation
    loss_rec_inst = ReconstructionLoss(
        nn.BCEWithLogitsLoss(reduction="none")
    ).to(device, dtype)
    loss_reg_inst = RegularizationLoss(
        lambda_p=args.lambda_p,
        max_steps=args.max_steps,
    ).to(device, dtype)

    # Optimizer
    optimizer = torch.optim.Adam(
        module.parameters(),
        lr=0.0003,
    )

    # Training and evaluation loops
    iterator = tqdm(enumerate(dataloader_train), total=args.n_iter)
    for step, (x_batch, y_true_batch) in iterator:
        x_batch = x_batch.to(device, dtype)
        y_true_batch = y_true_batch.to(device, dtype)

        y_pred_batch, p, halting_step = module(x_batch)

        loss_rec = loss_rec_inst(
            p,
            y_pred_batch,
            y_true_batch,
        )
        loss_reg = loss_reg_inst(
            p,
        )
        # Total objective: reconstruction + beta-weighted regularization.
        loss_overall = loss_rec + args.beta * loss_reg

        optimizer.zero_grad()
        loss_overall.backward()
        torch.nn.utils.clip_grad_norm_(module.parameters(), 1)
        optimizer.step()

        # Logging
        writer.add_scalar("loss_rec", loss_rec, step)
        writer.add_scalar("loss_reg", loss_reg, step)
        writer.add_scalar("loss_overall", loss_overall, step)

        # Evaluation
        if step % args.eval_frequency == 0:
            module.eval()

            for dataloader_name, dataloader in eval_dataloaders.items():
                metrics_single, metrics_per_step = evaluate(
                    dataloader,
                    module,
                )
                fig_dist = plot_distributions(
                    loss_reg_inst.p_g.cpu().numpy(),
                    metrics_per_step["p"],
                )
                writer.add_figure(
                    f"distributions/{dataloader_name}", fig_dist, step
                )

                fig_acc = plot_accuracy(metrics_per_step["accuracy"])
                writer.add_figure(
                    f"accuracy_per_step/{dataloader_name}", fig_acc, step
                )

                for metric_name, metric_value in metrics_single.items():
                    writer.add_scalar(
                        f"{metric_name}/{dataloader_name}",
                        metric_value,
                        step,
                    )

            torch.save(module, log_folder / "checkpoint.pth")
            module.train()


if __name__ == "__main__":
    main()
================================================
FILE: github_adventures/pondernet/utils.py
================================================
import torch
import torch.nn as nn
from torch.utils.data import Dataset
class ParityDataset(Dataset):
    """Parity of vectors - binary classification dataset.

    Each feature vector has entries in `{-1, 0, 1}` and the target is the
    parity (mod 2) of the number of entries equal to 1.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate.
    n_elems : int
        Size of the vectors.
    n_nonzero_min, n_nonzero_max : int or None
        Minimum (inclusive) and maximum (inclusive) number of nonzero
        elements in the feature vector. If not specified then `(1, n_elems)`.

    Raises
    ------
    ValueError
        If `0 <= n_nonzero_min <= n_nonzero_max <= n_elems` does not hold.
    """

    def __init__(
        self,
        n_samples,
        n_elems,
        n_nonzero_min=None,
        n_nonzero_max=None,
    ):
        self.n_samples = n_samples
        self.n_elems = n_elems
        # Fall back to the widest sensible range when bounds are not given
        self.n_nonzero_min = 1 if n_nonzero_min is None else n_nonzero_min
        self.n_nonzero_max = (
            n_elems if n_nonzero_max is None else n_nonzero_max
        )
        # Explicit validation instead of `assert` - asserts are stripped
        # when Python runs with -O, silently disabling the check
        if not 0 <= self.n_nonzero_min <= self.n_nonzero_max <= n_elems:
            raise ValueError(
                "Expected 0 <= n_nonzero_min <= n_nonzero_max <= n_elems"
            )

    def __len__(self):
        """Get the number of samples."""
        return self.n_samples

    def __getitem__(self, idx):
        """Get a feature vector and its parity (target).

        Note that the generating process is random - the same `idx` can
        yield different samples across calls.
        """
        x = torch.zeros((self.n_elems,))
        n_non_zero = torch.randint(
            self.n_nonzero_min, self.n_nonzero_max + 1, (1,)
        ).item()
        # Fill a prefix with random +-1 values, then shuffle the positions
        x[:n_non_zero] = torch.randint(0, 2, (n_non_zero,)) * 2 - 1
        x = x[torch.randperm(self.n_elems)]
        y = (x == 1.0).sum() % 2
        return x, y
class PonderNet(nn.Module):
    """Network that ponders.

    Parameters
    ----------
    n_elems : int
        Number of features in the vector.
    n_hidden : int
        Hidden layer size of the recurrent cell.
    max_steps : int
        Maximum number of steps the network can "ponder" for.
    allow_halting : bool
        If True, then the forward pass is allowed to halt before
        reaching the maximum steps.

    Attributes
    ----------
    cell : nn.GRUCell
        Learnable GRU cell that maps the previous hidden state and the input
        to a new hidden state.
    output_layer : nn.Linear
        Linear module that serves as the binary classifier. It inputs
        the hidden state.
    lambda_layer : nn.Linear
        Linear module that generates the halting probability at each step.
    """

    def __init__(
        self, n_elems, n_hidden=64, max_steps=20, allow_halting=False
    ):
        super().__init__()
        self.max_steps = max_steps
        self.n_hidden = n_hidden
        self.allow_halting = allow_halting
        self.cell = nn.GRUCell(n_elems, n_hidden)
        self.output_layer = nn.Linear(n_hidden, 1)  # per-step binary logit
        self.lambda_layer = nn.Linear(n_hidden, 1)  # per-step halting prob

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Batch of input features of shape `(batch_size, n_elems)`.

        Returns
        -------
        y : torch.Tensor
            Tensor of shape `(max_steps, batch_size)` representing
            the predictions for each step and each sample. In case
            `allow_halting=True` then the shape is
            `(steps, batch_size)` where `1 <= steps <= max_steps`.
        p : torch.Tensor
            Tensor of shape `(max_steps, batch_size)` representing
            the halting probabilities. Sums over rows (fixing a sample)
            are 1. In case `allow_halting=True` then the shape is
            `(steps, batch_size)` where `1 <= steps <= max_steps`.
        halting_step : torch.Tensor
            An integer for each sample in the batch that corresponds to
            the step when it was halted. The shape is `(batch_size,)`. The
            minimal value is 1 because we always run at least one step.
        """
        batch_size, _ = x.shape
        device = x.device
        # `new_zeros`/`new_ones` inherit x's device and dtype
        h = x.new_zeros(batch_size, self.n_hidden)
        # Probability of NOT having halted before the current step
        un_halted_prob = x.new_ones(batch_size)
        y_list = []
        p_list = []
        # 0 means "not halted yet"; later filled with the sampled halting step
        halting_step = torch.zeros(
            batch_size,
            dtype=torch.long,
            device=device,
        )
        for n in range(1, self.max_steps + 1):
            if n == self.max_steps:
                # Force halting at the last step so the p's sum to 1
                lambda_n = x.new_ones(batch_size)  # (batch_size,)
            else:
                lambda_n = torch.sigmoid(self.lambda_layer(h))[
                    :, 0
                ]  # (batch_size,)
            # Store relevant outputs
            y_list.append(self.output_layer(h)[:, 0])  # (batch_size,)
            p_list.append(un_halted_prob * lambda_n)  # (batch_size,)
            # Sample halting: only samples with halting_step == 0 (not yet
            # halted) may receive the current step number n
            halting_step = torch.maximum(
                n
                * (halting_step == 0)
                * torch.bernoulli(lambda_n).to(torch.long),
                halting_step,
            )
            # Prepare for next iteration
            un_halted_prob = un_halted_prob * (1 - lambda_n)
            h = self.cell(x, h)
            # Potentially stop if all samples halted
            if self.allow_halting and (halting_step > 0).sum() == batch_size:
                break
        y = torch.stack(y_list)
        p = torch.stack(p_list)
        return y, p, halting_step
class ReconstructionLoss(nn.Module):
    """Weighted average of per-step losses.

    Parameters
    ----------
    loss_func : callable
        Loss function that accepts `y_pred` and `y_true` as arguments. Both
        of these tensors have shape `(batch_size,)`. It outputs a loss for
        each sample in the batch.
    """

    def __init__(self, loss_func):
        super().__init__()
        self.loss_func = loss_func

    def forward(self, p, y_pred, y_true):
        """Compute the halting-probability-weighted reconstruction loss.

        Parameters
        ----------
        p : torch.Tensor
            Probability of halting of shape `(max_steps, batch_size)`.
        y_pred : torch.Tensor
            Predicted outputs of shape `(max_steps, batch_size)`.
        y_true : torch.Tensor
            True targets of shape `(batch_size,)`.

        Returns
        -------
        loss : torch.Tensor
            Scalar: sum over steps of the batch-mean per-sample loss,
            each step weighted by its halting probability.
        """
        total = p.new_tensor(0.0)
        # Iterate steps in lockstep over halting probs and predictions
        for step_p, step_pred in zip(p, y_pred):
            weighted = step_p * self.loss_func(step_pred, y_true)
            total = total + weighted.mean()
        return total
class RegularizationLoss(nn.Module):
    """Push the halting distribution towards a geometric distribution.

    Parameters
    ----------
    lambda_p : float
        The single parameter determining uniquely the geometric distribution.
        Note that the expected value of this distribution is going to be
        `1 / lambda_p`.
    max_steps : int
        Maximum number of pondering steps.
    """

    def __init__(self, lambda_p, max_steps=20):
        super().__init__()
        # Truncated geometric distribution built iteratively:
        # P(halt at step k) = (1 - lambda_p) ** k * lambda_p
        p_g = torch.zeros((max_steps,))
        survival = 1.0
        for step in range(max_steps):
            p_g[step] = survival * lambda_p
            survival = survival * (1 - lambda_p)
        # Registered as a buffer: moves with `.to(...)` but is not trained
        self.register_buffer("p_g", p_g)
        self.kl_div = nn.KLDivLoss(reduction="batchmean")

    def forward(self, p):
        """Compute loss.

        Parameters
        ----------
        p : torch.Tensor
            Probability of halting of shape `(steps, batch_size)`.

        Returns
        -------
        loss : torch.Tensor
            Scalar representing the regularization loss.
        """
        steps, _ = p.shape
        p_t = p.transpose(0, 1)  # (batch_size, steps)
        # Broadcast the geometric prior over the whole batch
        target = self.p_g[None, :steps].expand_as(p_t)
        return self.kl_div(p_t.log(), target)
================================================
FILE: github_adventures/product_quantization/README.md
================================================
# Installation
Run the following to get all the dependencies.
```
pip install -r requirements.txt
```
# Faiss 101
The code for the short intro to FAISS can be found in `faiss_101_ipython.py`.
Note that you can use `parse.py` to turn the raw fasttext embeddings
into a numpy array. See `run_all.sh` for example usage.
# Custom PQ implementation
The custom PQ implementation can be found inside of `custom.py`.
# End to end script
The script `run_all.sh` does the following things:
* Download fasttext embeddings
* Train multiple indexes (faiss + custom) using the embeddings
* Serve gradio apps for similarity search comparing different indexes
```
chmod +x run_all.sh
./run_all.sh
```
Don't forget to kill the Gradio processes by `pkill -f gradio` once you
don't need them anymore.
================================================
FILE: github_adventures/product_quantization/convert.py
================================================
import argparse
import logging
import pathlib
import pickle
import faiss
from custom import CustomIndexPQ
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def from_faiss(faiss_index: faiss.swigfaiss.IndexPQ) -> CustomIndexPQ:
    """Build a ``CustomIndexPQ`` from a trained faiss ``IndexPQ``.

    Parameters
    ----------
    faiss_index
        A trained faiss ``IndexPQ`` that already contains codes.

    Returns
    -------
    CustomIndexPQ
        Custom index with the cluster centers and codes copied over.

    Raises
    ------
    ValueError
        If the faiss index is not trained or contains no codes.
    """
    if not faiss_index.is_trained:
        raise ValueError("The faiss index is not trained")
    if faiss_index.ntotal == 0:
        raise ValueError("The faiss index has no codes")
    d = faiss_index.d
    m = faiss_index.code_size
    nbits = faiss_index.pq.nbits
    k = 2**nbits
    ntotal = faiss_index.ntotal
    custom_index = CustomIndexPQ(d=d, m=m, nbits=nbits)
    # faiss keeps all centroids in one flat array - reshape into
    # (segments, centroids per segment, segment dimensionality)
    centers = faiss.vector_to_array(faiss_index.pq.centroids).reshape(
        m, k, d // m
    )
    logger.info("Copying centers from the faiss index")
    for i in range(m):
        custom_index.estimators[i].cluster_centers_ = centers[i]
    custom_index.is_trained = True
    # Fixed log-message typo: "form" -> "from"
    logger.info("Copying codes from the faiss index")
    custom_index.codes = faiss.vector_to_array(faiss_index.codes).reshape(
        ntotal, m
    )
    return custom_index
def main() -> int:
    """CLI entry point: convert a faiss IndexPQ into a pickled CustomIndexPQ.

    Returns
    -------
    int
        Process exit code (0 on success). The original promised an int in
        its annotation but implicitly returned None.
    """
    parser = argparse.ArgumentParser("Convert from faiss to custom")
    parser.add_argument(
        "faiss_index_path",
        type=pathlib.Path,
        help="Path to a faiss index",
    )
    parser.add_argument(
        "output_index_path",
        type=pathlib.Path,
        help="Path to a new custom index with faiss parameters",
    )
    args = parser.parse_args()
    faiss_index = faiss.read_index(str(args.faiss_index_path))
    custom_index = from_faiss(faiss_index)
    with args.output_index_path.open("wb") as f:
        pickle.dump(custom_index, f)
    return 0
if __name__ == "__main__":
main()
================================================
FILE: github_adventures/product_quantization/custom.py
================================================
from __future__ import annotations
import logging
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
logger = logging.getLogger(__name__)
BITS2DTYPE = {
8: np.uint8,
}
class CustomIndexPQ:
    """Custom IndexPQ implementation.

    Product quantization: every vector is split into `m` segments and each
    segment is represented by the id of its closest KMeans centroid.

    Parameters
    ----------
    d
        Dimensionality of the original vectors.
    m
        Number of segments.
    nbits
        Number of bits.
    estimator_kwargs
        Additional hyperparameters passed onto the sklearn KMeans
        class.
    """

    def __init__(
        self,
        d: int,
        m: int,
        nbits: int,
        **estimator_kwargs: str | int,
    ) -> None:
        if d % m != 0:
            raise ValueError("d needs to be a multiple of m")
        if nbits not in BITS2DTYPE:
            raise ValueError(f"Unsupported number of bits {nbits}")
        self.m = m
        self.k = 2**nbits  # number of centroids per segment
        self.d = d
        self.ds = d // m  # dimensionality of a single segment
        # One independent KMeans estimator per segment
        self.estimators = [
            KMeans(n_clusters=self.k, **estimator_kwargs) for _ in range(m)
        ]
        logger.info(f"Creating following estimators: {self.estimators[0]!r}")
        self.is_trained = False
        self.dtype = BITS2DTYPE[nbits]  # dtype of the stored codes
        self.dtype_orig = np.float32  # dtype of the original vectors
        self.codes: np.ndarray | None = None  # (n_codes, m) after `add`

    def train(self, X: np.ndarray) -> None:
        """Train all KMeans estimators.

        Parameters
        ----------
        X
            Array of shape `(n, d)` and dtype `float32`.

        Raises
        ------
        ValueError
            If the index was already trained.
        """
        if self.is_trained:
            raise ValueError("Training multiple times is not allowed")
        for i in range(self.m):
            estimator = self.estimators[i]
            X_i = X[:, i * self.ds : (i + 1) * self.ds]
            logger.info(f"Fitting KMeans for the {i}-th segment")
            estimator.fit(X_i)
        self.is_trained = True

    def encode(self, X: np.ndarray) -> np.ndarray:
        """Encode original features into codes.

        Parameters
        ----------
        X
            Array of shape `(n_queries, d)` of dtype `np.float32`.

        Returns
        -------
        result
            Array of shape `(n_queries, m)` of dtype `np.uint8`.
        """
        n = len(X)
        result = np.empty((n, self.m), dtype=self.dtype)
        for i in range(self.m):
            estimator = self.estimators[i]
            X_i = X[:, i * self.ds : (i + 1) * self.ds]
            # Code = id of the closest centroid within this segment
            result[:, i] = estimator.predict(X_i)
        return result

    def add(self, X: np.ndarray) -> None:
        """Add vectors to the database (their encoded versions).

        Repeated calls append to the database (matching faiss' `add`
        semantics). The original implementation silently overwrote any
        previously added codes.

        Parameters
        ----------
        X
            Array of shape `(n_codes, d)` of dtype `np.float32`.

        Raises
        ------
        ValueError
            If the quantizer was not trained yet.
        """
        if not self.is_trained:
            raise ValueError("The quantizer needs to be trained first.")
        new_codes = self.encode(X)
        if self.codes is None:
            self.codes = new_codes
        else:
            # BUGFIX: append instead of discarding earlier additions
            self.codes = np.vstack([self.codes, new_codes])

    def compute_asymmetric_distances(self, X: np.ndarray) -> np.ndarray:
        """Compute asymmetric distances to all database codes.

        Parameters
        ----------
        X
            Array of shape `(n_queries, d)` of dtype `np.float32`.

        Returns
        -------
        distances
            Array of shape `(n_queries, n_codes)` of dtype `np.float32`.

        Raises
        ------
        ValueError
            If the quantizer is untrained or no codes were added.
        """
        if not self.is_trained:
            raise ValueError("The quantizer needs to be trained first.")
        if self.codes is None:
            raise ValueError("No codes detected. You need to run `add` first")
        n_queries = len(X)
        n_codes = len(self.codes)
        # Squared distance from every query segment to every centroid
        distance_table = np.empty(
            (n_queries, self.m, self.k), dtype=self.dtype_orig
        )  # (n_queries, m, k)
        for i in range(self.m):
            X_i = X[:, i * self.ds : (i + 1) * self.ds]  # (n_queries, ds)
            centers = self.estimators[i].cluster_centers_  # (k, ds)
            distance_table[:, i, :] = euclidean_distances(
                X_i, centers, squared=True
            )
        # Sum the per-segment distances selected by the stored codes
        distances = np.zeros((n_queries, n_codes), dtype=self.dtype_orig)
        for i in range(self.m):
            distances += distance_table[:, i, self.codes[:, i]]
        return distances

    def search(self, X: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]:
        """Find k closest database codes to given queries.

        Parameters
        ----------
        X
            Array of shape `(n_queries, d)` of dtype `np.float32`.
        k
            The number of closest codes to look for.

        Returns
        -------
        distances
            Array of shape `(n_queries, k)`.
        indices
            Array of shape `(n_queries, k)`.
        """
        n_queries = len(X)
        distances_all = self.compute_asymmetric_distances(X)
        # Indices of the k smallest distances per query
        indices = np.argsort(distances_all, axis=1)[:, :k]
        distances = np.empty((n_queries, k), dtype=np.float32)
        for i in range(n_queries):
            distances[i] = distances_all[i][indices[i]]
        return distances, indices
================================================
FILE: github_adventures/product_quantization/faiss_101_ipython.py
================================================
# NOTE: this file is an interactive (IPython) transcript - the bare
# expressions below (e.g. `embs.shape`) are meant to be evaluated and
# inspected line by line, not run as a script.
import numpy as np
import faiss
# Load fast text embeddings
embs = np.load("parsed_fasttext/embs.npy")  # change path if necessary
embs.shape  # (n_words, d)
embs.nbytes / 1e6  # raw float32 matrix size in MB
# Prepare parameters
d = embs.shape[1]  # dimensionality of the embeddings
m = 10  # number of segments per vector
nbits = 8  # bits per segment code
k = 2 ** nbits  # centroids per segment = 256
k
# Construct index
index = faiss.IndexPQ(d, m, nbits)
index.is_trained  # False - the per-segment quantizers are not fitted yet
# Try encoding without any training
# NOTE(review): expected to misbehave/fail before training - confirm
index.sa_encode(embs[:2])
# Train the model
index.train(embs)
index.is_trained
index.ntotal  # still 0 - training does not add vectors
# Add vectors to the database
index.add(embs)
index.ntotal
# Codes are stored flat; reshape to one row of m uint8 codes per vector
codes = faiss.vector_to_array(index.codes).reshape(index.ntotal, m)
codes[:3]
codes.nbytes / 1e6  # compressed size in MB - compare with embs.nbytes above
# Try searching - EXHAUSTIVE SEARCH
index.search(embs[:3], 4)
# Quickly show that with flat index distances are precise
flat_index = faiss.IndexFlatL2(d)
flat_index.train(embs)
flat_index.add(embs)
flat_index.search(embs[:3], 4)
================================================
FILE: github_adventures/product_quantization/generate_index.py
================================================
from __future__ import annotations
import argparse
import logging
import pathlib
import pickle
import faiss
import numpy as np
from custom import CustomIndexPQ
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# CLI: build a faiss or custom PQ index from an embeddings array and
# persist it to disk.
parser = argparse.ArgumentParser()
parser.add_argument(
    "input_path",
    type=pathlib.Path,
    help="Path to the full embeddings array",
)
parser.add_argument(
    "index_type",
    type=str,
    choices=["faiss-flat", "faiss-pq", "our-pq"],
    help="Type of index to generate",
)
parser.add_argument(
    "output_path",
    type=pathlib.Path,
    help="Path to where to store the index"
)
# Any extra `--key value` pairs are forwarded as index hyperparameters
args, unknown_kwargs = parser.parse_known_args()
# Values may stay strings (e.g. `--init random`), hence `int | str`
hyperparams: dict[str, int | str] = {}
for i in range(0, len(unknown_kwargs), 2):
    key_raw, value_raw = unknown_kwargs[i], unknown_kwargs[i + 1]
    # BUGFIX: `strip("--")` removed dashes from BOTH ends of the name;
    # `lstrip("-")` only drops the leading option prefix.
    key = key_raw.lstrip("-")
    value = int(value_raw) if value_raw.isnumeric() else value_raw
    hyperparams[key] = value
logger.info(f"The following hyperparameters were detected {hyperparams}")
logger.info("Loading embeddings")
embs = np.load(args.input_path)
n, d = embs.shape
if args.index_type == "faiss-flat":
    logger.info("Instantiating IndexFlatL2")
    index = faiss.IndexFlatL2(d)
elif args.index_type == "faiss-pq":
    logger.info("Instantiating IndexPQ")
    arguments = [d, hyperparams["m"], hyperparams["nbits"]]
    index = faiss.IndexPQ(*arguments)
elif args.index_type == "our-pq":
    logger.info("Instantiating CustomIndexPQ")
    index = CustomIndexPQ(d, **hyperparams)
logger.info("Training the index")
index.train(embs)
logger.info("Adding all embeddings to the index")
index.add(embs)
logger.info(f"Writing index to disk - {args.output_path}")
if args.index_type == "our-pq":
    # The custom index is a plain Python object - pickle it
    with args.output_path.open("wb") as f:
        pickle.dump(index, f)
else:
    faiss.write_index(index, str(args.output_path))
================================================
FILE: github_adventures/product_quantization/parse.py
================================================
from __future__ import annotations
import argparse
import io
import logging
import pathlib
import tqdm
import numpy as np
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def get_embeddings(path: str, maximum: int | None = None) -> tuple[list[str], np.ndarray]:
    """Parse fasttext-style embeddings from a text file.

    The expected format is a header line ``"<n> <d>"`` followed by one line
    per word: the word itself and `d` space-separated floats.

    Parameters
    ----------
    path
        Path to the raw fasttext `.vec` file.
    maximum
        If given, only the first `maximum` embeddings are parsed.

    Returns
    -------
    words
        List of parsed words.
    embs
        Array of shape `(n, d)` and dtype `float32`.
    """
    # Context manager guarantees the file handle is closed - the original
    # opened the file and never closed it
    with io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        n = n if maximum is None else min(n, maximum)
        embs: np.ndarray = np.empty((n, d), dtype=np.float32)
        words: list[str] = []
        for i, line in tqdm.tqdm(enumerate(fin)):
            if maximum is not None and i == maximum:
                break
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            embs[i] = list(map(float, tokens[1:]))
    return words, embs
# CLI: parse raw fasttext embeddings into a words file + a numpy array.
parser = argparse.ArgumentParser()
parser.add_argument(
    "fasttext_path",
    type=pathlib.Path,
    help="Path to fasttext embeddings.",
)
parser.add_argument(
    "output_dir",
    type=pathlib.Path,
    help="Directory where we store the words and the embeddings."
)
parser.add_argument(
    "-m",
    "--max",
    type=int,
    help="Maximum number of embeddings to parse."
)
args = parser.parse_args()

# Output locations
path_embs = args.output_dir / "embs.npy"
path_words = args.output_dir / "words.txt"
args.output_dir.mkdir(exist_ok=True, parents=True)

logger.info("Parsing")
words, embs = get_embeddings(args.fasttext_path, maximum=args.max)

logger.info("Saving words")
with path_words.open("w") as f:
    # One word per line, same output as the original write-per-word loop
    f.writelines(f"{word}\n" for word in words)

logger.info("Saving embeddings")
np.save(path_embs, embs)
================================================
FILE: github_adventures/product_quantization/requirements.txt
================================================
faiss-cpu==1.7.2
gradio==3.0.17
numpy==1.22.4
pandas==1.4.2
scikit-learn==1.1.1
================================================
FILE: github_adventures/product_quantization/run_all.sh
================================================
#!/usr/bin/env bash
# End-to-end pipeline: download fasttext embeddings, build faiss + custom
# PQ indexes and launch gradio comparison apps.
set -ex
# Parameters
URL=https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
RAW_FASTTEXT=raw_fasttext.vec
MAX_WORDS=100000
OUTPUT_FOLDER=new_results # no slash
SCIKIT_KWARGS='--n_init 1 --max_iter 30 --init random'
# Download fasttext embeddings (skipped when already cached on disk)
if [ ! -f $RAW_FASTTEXT ]
then
    curl $URL --output $RAW_FASTTEXT.gz
    gzip -d $RAW_FASTTEXT.gz
fi
# BUGFIX: `-p` keeps reruns from aborting under `set -e` when the
# output folder already exists
mkdir -p $OUTPUT_FOLDER
# Parse raw data
python parse.py $RAW_FASTTEXT $OUTPUT_FOLDER -m $MAX_WORDS
# Generate a couple of different indexes
python generate_index.py \
    $OUTPUT_FOLDER/embs.npy \
    faiss-flat \
    $OUTPUT_FOLDER/flat.faiss
python generate_index.py \
    $OUTPUT_FOLDER/embs.npy \
    faiss-pq \
    $OUTPUT_FOLDER/faisspq_m4_nbits8.faiss \
    --m 4 \
    --nbits 8
python generate_index.py \
    $OUTPUT_FOLDER/embs.npy \
    faiss-pq \
    $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \
    --m 12 \
    --nbits 8
python generate_index.py \
    $OUTPUT_FOLDER/embs.npy \
    our-pq \
    $OUTPUT_FOLDER/custompq_m4_nbits8.pkl \
    --m 4 \
    --nbits 8 \
    $SCIKIT_KWARGS
python generate_index.py \
    $OUTPUT_FOLDER/embs.npy \
    our-pq \
    $OUTPUT_FOLDER/custompq_m12_nbits8.pkl \
    --m 12 \
    --nbits 8 \
    $SCIKIT_KWARGS
# Convert faiss index into custom index
python convert.py \
    $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \
    $OUTPUT_FOLDER/converted_faisspq_m12_nbits8.pkl
# Run webapps in the background, one port per comparison
GRADIO_SERVER_PORT=7777 python run_gradio.py \
    $OUTPUT_FOLDER/flat.faiss \
    $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \
    $OUTPUT_FOLDER/converted_faisspq_m12_nbits8.pkl \
    $OUTPUT_FOLDER/words.txt \
    &
GRADIO_SERVER_PORT=7778 python run_gradio.py \
    $OUTPUT_FOLDER/flat.faiss \
    $OUTPUT_FOLDER/faisspq_m4_nbits8.faiss \
    $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \
    $OUTPUT_FOLDER/words.txt \
    &
GRADIO_SERVER_PORT=7779 python run_gradio.py \
    $OUTPUT_FOLDER/flat.faiss \
    $OUTPUT_FOLDER/custompq_m4_nbits8.pkl \
    $OUTPUT_FOLDER/custompq_m12_nbits8.pkl \
    $OUTPUT_FOLDER/words.txt \
    &
# make sure to kill the gradio processes pkill -f gradio
================================================
FILE: github_adventures/product_quantization/run_gradio.py
================================================
from __future__ import annotations
import argparse
import logging
import pathlib
import pickle
import time
from functools import partial
from typing import Any
import faiss
import gradio as gr
import numpy as np
import pandas as pd
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser()
parser.add_argument(
"exact_index_path",
type=pathlib.Path,
help="Path to the exact index",
)
parser.add_argument(
"approximate_index_path",
type=pathlib.Path,
nargs="+",
help="Path to the approximate index",
)
parser.add_argument(
"words_path",
type=pathlib.Path,
help="Path to the text file containing words",
)
args = parser.parse_args()
def run(
    word: str,
    k: int,
    exact_index,
    approximate_indexes: dict[str, Any],
    words: list[str],
    word2ix: dict[str, int],
) -> tuple[pd.DataFrame | dict[str, float], ...]:
    """Search the k nearest neighbors of `word` in every index.

    Parameters
    ----------
    word
        Query word; must be a key of `word2ix`.
    k
        Number of nearest neighbors to retrieve.
    exact_index
        Ground-truth index exposing `reconstruct` and `search`.
    approximate_indexes
        Mapping of name -> approximate index (only `search` is used).
    words
        All words; list position corresponds to the index row id.
    word2ix
        Mapping of word -> row id in the exact index.

    Returns
    -------
    tuple
        `(df_exact, df_approx_1, ..., df_approx_n, metrics)` - one result
        DataFrame per index, followed by a dict of timings and recalls.
        (The original annotation promised exactly two DataFrames, but the
        tuple grows with `len(approximate_indexes)`.)
    """
    metrics = {}
    emb = exact_index.reconstruct(word2ix[word])
    start = time.monotonic()
    D, I = exact_index.search(emb[None, :], k)
    metrics["time_exact"] = time.monotonic() - start
    D, I = D[0], I[0]
    df_e = pd.DataFrame({
        "ix": I,
        "distance": D,
        "word": [words[i] for i in I],
    })
    dfs_a = []
    for name, approximate_index in approximate_indexes.items():
        start = time.monotonic()
        D, I = approximate_index.search(emb[None, :], k)
        metrics[f"time_approximate_{name}"] = time.monotonic() - start
        D, I = D[0], I[0]
        df_a = pd.DataFrame({
            "ix": I,
            "distance": D,
            "word": [words[i] for i in I],
        })
        dfs_a.append(df_a)
        # Recall = fraction of exact neighbors the approximate index found
        metrics[f"recall_{name}"] = len(np.intersect1d(df_e.word.unique(), df_a.word.unique())) / k
    return df_e, *dfs_a, metrics
# Load the word list, all indexes, then serve a gradio comparison app.
logger.info(f"Loading words {args.words_path}")
words = args.words_path.read_text().strip().split("\n")
word2ix = {word: i for i, word in enumerate(words)}

logger.info(f"Loading exact index {args.exact_index_path}")
exact_index = faiss.read_index(str(args.exact_index_path))

logger.info(f"Loading approximate indexes {args.approximate_index_path}")
approximate_indexes = {}
for path in args.approximate_index_path:
    # BUGFIX: `Path.suffix` always includes the leading dot, so the
    # original `"pickle"` entry could never match - use ".pickle".
    if path.suffix in {".pkl", ".pickle"}:
        # Pickled CustomIndexPQ
        with path.open("rb") as f:
            approximate_indexes[path.stem] = pickle.load(f)
    else:
        approximate_indexes[path.stem] = faiss.read_index(str(path))

# Sanity checks
assert isinstance(exact_index, faiss.IndexFlat)
# assert len(words) == exact_index.ntotal == approximate_index.ntotal

run_partial = partial(
    run,
    exact_index=exact_index,
    approximate_indexes=approximate_indexes,
    words=words,
    word2ix=word2ix,
)
# `partial` objects carry no `__name__`; give one for downstream consumers
# (NOTE(review): presumably gradio reads it - confirm)
setattr(run_partial, "__name__", "run_function")

demo = gr.Interface(
    fn=run_partial,
    inputs=[
        gr.Textbox(lines=1, placeholder="Word here..."),
        gr.Slider(minimum=1, maximum=20, value=5, step=1),
    ],
    outputs=[
        # One DataFrame per index plus a metrics JSON panel - must match
        # the tuple returned by `run`
        gr.DataFrame(label="exact"),
        *[gr.DataFrame(label=name) for name in approximate_indexes.keys()],
        gr.JSON(label="metrics"),
    ],
    allow_flagging="never",
)
demo.launch()
================================================
FILE: github_adventures/siren/activations.py
================================================
import pathlib
from functools import partial
import torch
from torch.utils.tensorboard import SummaryWriter
from core import ImageSiren
# Visualize activation distributions of ImageSiren under different weight
# initialization schemes by logging per-layer histograms to TensorBoard.
torch.manual_seed(2)
init_functions = {
    "ones": torch.nn.init.ones_,
    "eye": torch.nn.init.eye_,
    "default": partial(torch.nn.init.kaiming_uniform_, a=5 ** (1 / 2)),
    "paper": None,  # None -> ImageSiren falls back to the paper's init
}
for fname, func in init_functions.items():
    # Separate TensorBoard run directory per init scheme
    path = pathlib.Path.cwd() / "tensorboard_logs" / fname
    writer = SummaryWriter(path)

    def fh(inst, inp, out, number=0):
        # Forward hook: log a histogram of this layer's output activations
        layer_name = f"{number}_{inst.__class__.__name__}"
        writer.add_histogram(layer_name, out)

    model = ImageSiren(
        hidden_layers=10,
        hidden_features=200,
        first_omega=30,
        hidden_omega=30,
        custom_init_function_=func,
    )
    for i, layer in enumerate(model.net.modules()):
        if not i:
            # index 0 is the Sequential container itself - skip it
            continue
        layer.register_forward_hook(partial(fh, number=(i + 1) // 2))

    # Uniform inputs in [-1, 1), same coordinate range PixelDataset produces
    inp = 2 * (torch.rand(10000, 2) - 0.5)
    writer.add_histogram("0", inp)
    # Running the forward pass triggers all hooks, writing the histograms
    res = model(inp)
================================================
FILE: github_adventures/siren/core.py
================================================
import numpy as np
import torch
import torch.nn as nn
from scipy.ndimage import laplace, sobel
from torch.utils.data import Dataset
def paper_init_(weight, is_first=False, omega=1):
    """Initialize the weight of a Linear layer in place (SIREN scheme).

    Parameters
    ----------
    weight : torch.Tensor
        The learnable 2D weight matrix, modified in place.
    is_first : bool
        If True, this Linear layer is the very first one in the network.
    omega : float
        Hyperparameter.
    """
    fan_in = weight.shape[1]
    # First layer: 1 / fan_in; later layers: sqrt(6 / fan_in) / omega
    bound = 1 / fan_in if is_first else np.sqrt(6 / fan_in) / omega
    with torch.no_grad():
        weight.uniform_(-bound, bound)
class SineLayer(nn.Module):
    """Linear layer followed by the sine activation.

    Parameters
    ----------
    in_features : int
        Number of input features.
    out_features : int
        Number of output features.
    bias : bool
        If True, the bias is included.
    is_first : bool
        If True, then it represents the first layer of the network. Note that
        it influences the initialization scheme.
    omega : int
        Hyperparameter. Determines scaling.
    custom_init_function_ : None or callable
        If None, then we are going to use the `paper_init_` defined above.
        Otherwise, any callable that modifies the `weight` parameter in place.

    Attributes
    ----------
    linear : nn.Linear
        Linear layer.
    """

    def __init__(
        self,
        in_features,
        out_features,
        bias=True,
        is_first=False,
        omega=30,
        custom_init_function_=None,
    ):
        super().__init__()
        self.omega = omega
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        # Default to the paper's init unless a custom initializer was given
        if custom_init_function_ is None:
            paper_init_(self.linear.weight, is_first=is_first, omega=omega)
        else:
            custom_init_function_(self.linear.weight)

    def forward(self, x):
        """Apply `sin(omega * (W x + b))` row-wise.

        Parameters
        ----------
        x : torch.Tensor
            Tensor of shape `(n_samples, in_features)`.

        Returns
        -------
        torch.Tensor
            Tensor of shape `(n_samples, out_features)`.
        """
        pre_activation = self.linear(x)
        return torch.sin(self.omega * pre_activation)
class ImageSiren(nn.Module):
    """Network composed of SineLayers.

    Parameters
    ----------
    hidden_features : int
        Number of hidden features (each hidden layer the same).
    hidden_layers : int
        Number of hidden layers.
    first_omega, hidden_omega : float
        Hyperparameter influencing scaling.
    custom_init_function_ : None or callable
        If None, then we are going to use the `paper_init_` defined above.
        Otherwise any callable that modifies the `weight` parameter in place.

    Attributes
    ----------
    net : nn.Sequential
        Sequential collection of `SineLayer` and `nn.Linear` at the end.
    """

    def __init__(
        self,
        hidden_features,
        hidden_layers=1,
        first_omega=30,
        hidden_omega=30,
        custom_init_function_=None,
    ):
        super().__init__()
        in_features = 2
        out_features = 1

        # First sine layer: maps 2D coordinates to the hidden width
        layers = [
            SineLayer(
                in_features,
                hidden_features,
                is_first=True,
                custom_init_function_=custom_init_function_,
                omega=first_omega,
            )
        ]
        # Hidden sine layers
        layers.extend(
            SineLayer(
                hidden_features,
                hidden_features,
                is_first=False,
                custom_init_function_=custom_init_function_,
                omega=hidden_omega,
            )
            for _ in range(hidden_layers)
        )
        # Plain linear head producing one intensity per coordinate
        final_linear = nn.Linear(hidden_features, out_features)
        if custom_init_function_ is None:
            paper_init_(final_linear.weight, is_first=False, omega=hidden_omega)
        else:
            custom_init_function_(final_linear.weight)
        layers.append(final_linear)
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Tensor of shape `(n_samples, 2)` representing the 2D pixel
            coordinates.

        Returns
        -------
        torch.Tensor
            Tensor of shape `(n_samples, 1)` representing the predicted
            intensities.
        """
        return self.net(x)
def generate_coordinates(n):
    """Generate a regular grid of integer 2D coordinates on [0, n) x [0, n).

    Parameters
    ----------
    n : int
        Number of points per dimension.

    Returns
    -------
    coords_abs : np.ndarray
        Array of row and column coordinates of shape `(n ** 2, 2)`.
    """
    grid_rows, grid_cols = np.meshgrid(range(n), range(n), indexing="ij")
    # Flatten row-major and pair up (row, col) per pixel
    flat_axes = [grid_rows.ravel(), grid_cols.ravel()]
    return np.stack(flat_axes, axis=-1)
class PixelDataset(Dataset):
    """Dataset yielding coordinates, intensities and (higher) derivatives.

    Parameters
    ----------
    img : np.ndarray
        2D array representing a grayscale image.

    Attributes
    ----------
    size : int
        Height and width of the square image.
    coords_abs : np.ndarray
        Array of shape `(size ** 2, 2)` with all pixel coordinates of `img`.
    grad : np.ndarray
        Array of shape `(size, size, 2)` with the approximate gradient
        in the two directions.
    grad_norm : np.ndarray
        Array of shape `(size, size)` with the approximate gradient norm.
    laplace : np.ndarray
        Array of shape `(size, size)` with the approximate laplace operator.
    """

    def __init__(self, img):
        if img.ndim != 2 or img.shape[0] != img.shape[1]:
            raise ValueError("Only 2D square images are supported.")
        self.img = img
        self.size = img.shape[0]
        self.coords_abs = generate_coordinates(self.size)
        # Finite-difference derivatives are computed once, up front
        self.grad = np.stack([sobel(img, axis=0), sobel(img, axis=1)], axis=-1)
        self.grad_norm = np.linalg.norm(self.grad, axis=-1)
        self.laplace = laplace(img)

    def __len__(self):
        """Determine the number of samples (pixels)."""
        return self.size ** 2

    def __getitem__(self, idx):
        """Get all relevant data for a single coordinate."""
        pixel = self.coords_abs[idx]
        r, c = pixel
        # Map absolute pixel coordinates into [-1, 1)
        coords = 2 * ((pixel / self.size) - 0.5)
        return {
            "coords": coords,
            "coords_abs": pixel,
            "intensity": self.img[r, c],
            "grad_norm": self.grad_norm[r, c],
            "grad": self.grad[r, c],
            "laplace": self.laplace[r, c],
        }
class GradientUtils:
    """Autograd helpers for differential quantities of a network output."""

    @staticmethod
    def gradient(target, coords):
        """Compute the gradient of `target` with respect to `coords`.

        Parameters
        ----------
        target : torch.Tensor
            2D tensor of shape `(n_coords, ?)` representing the targets.
        coords : torch.Tensor
            2D tensor of shape `(n_coords, 2)` representing the coordinates.

        Returns
        -------
        grad : torch.Tensor
            2D tensor of shape `(n_coords, 2)` representing the gradient.
        """
        (grad,) = torch.autograd.grad(
            target,
            coords,
            grad_outputs=torch.ones_like(target),
            create_graph=True,
        )
        return grad

    @staticmethod
    def divergence(grad, coords):
        """Compute divergence.

        Parameters
        ----------
        grad : torch.Tensor
            2D tensor of shape `(n_coords, 2)` representing the gradient wrt
            x and y.
        coords : torch.Tensor
            2D tensor of shape `(n_coords, 2)` representing the coordinates.

        Returns
        -------
        div : torch.Tensor
            2D tensor of shape `(n_coords, 1)` representing the divergence.

        Notes
        -----
        In a 2D case this will give us f_{xx} + f_{yy}.
        """
        result = 0.0
        for dim in range(coords.shape[1]):
            component = grad[..., dim]
            (second,) = torch.autograd.grad(
                component,
                coords,
                torch.ones_like(component),
                create_graph=True,
            )
            # Keep only the diagonal term of the Hessian for this dimension
            result = result + second[..., dim : dim + 1]
        return result

    @staticmethod
    def laplace(target, coords):
        """Compute laplace operator (divergence of the gradient).

        Parameters
        ----------
        target : torch.Tensor
            2D tensor of shape `(n_coords, 1)` representing the targets.
        coords : torch.Tensor
            2D tensor of shape `(n_coords, 2)` representing the coordinates.

        Returns
        -------
        torch.Tensor
            2D tensor of shape `(n_coords, 1)` representing the laplace.
        """
        return GradientUtils.divergence(
            GradientUtils.gradient(target, coords), coords
        )
================================================
FILE: github_adventures/siren/train.py
================================================
import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import tqdm
from torch.nn import Linear, ReLU, Sequential
from torch.utils.data import DataLoader

from core import GradientUtils, ImageSiren, PixelDataset
# Image loading
img_ = plt.imread("dog.png")
downsampling_factor = 4
img = 2 * (img_ - 0.5)  # rescale intensities from [0, 1] to [-1, 1]
img = img[::downsampling_factor, ::downsampling_factor]
size = img.shape[0]

dataset = PixelDataset(img)

# Parameters
n_epochs = 100
batch_size = int(size ** 2)  # a single batch covers every pixel
logging_freq = 20

model_name = "siren"  # "siren", "mlp_relu"
hidden_features = 256
hidden_layers = 3
target = "intensity"  # "intensity", "grad", "laplace"

# Model creation
if model_name == "siren":
    model = ImageSiren(
        hidden_features,
        hidden_layers=hidden_layers,
        hidden_omega=30,
    )
elif model_name == "mlp_relu":
    layers = [Linear(2, hidden_features), ReLU()]
    for _ in range(hidden_layers):
        layers.append(Linear(hidden_features, hidden_features))
        layers.append(ReLU())
    layers.append(Linear(hidden_features, 1))
    model = Sequential(*layers)

    # Xavier initialization for the linear layers of the ReLU baseline.
    for module in model.modules():
        if not isinstance(module, Linear):
            continue
        torch.nn.init.xavier_normal_(module.weight)
else:
    raise ValueError("Unsupported model")

dataloader = DataLoader(dataset, batch_size=batch_size)
optim = torch.optim.Adam(lr=1e-4, params=model.parameters())

# Make sure the output directory exists before the first savefig call.
os.makedirs("visualization", exist_ok=True)

# Training loop
for e in range(n_epochs):
    losses = []
    for d_batch in tqdm.tqdm(dataloader):
        x_batch = d_batch["coords"].to(torch.float32)
        x_batch.requires_grad = True  # needed for grad/laplace supervision
        y_true_batch = d_batch["intensity"].to(torch.float32)
        y_true_batch = y_true_batch[:, None]

        y_pred_batch = model(x_batch)

        # The supervision signal is either the raw intensity or one of the
        # image derivatives, matched against autograd derivatives of the model.
        if target == "intensity":
            loss = ((y_true_batch - y_pred_batch) ** 2).mean()
        elif target == "grad":
            y_pred_g_batch = GradientUtils.gradient(y_pred_batch, x_batch)
            y_true_g_batch = d_batch["grad"].to(torch.float32)
            loss = ((y_true_g_batch - y_pred_g_batch) ** 2).mean()
        elif target == "laplace":
            y_pred_l_batch = GradientUtils.laplace(y_pred_batch, x_batch)
            y_true_l_batch = d_batch["laplace"].to(torch.float32)[:, None]
            loss = ((y_true_l_batch - y_pred_l_batch) ** 2).mean()
        else:
            raise ValueError("Unrecognized target")

        losses.append(loss.item())

        optim.zero_grad()
        loss.backward()
        optim.step()

    print(e, np.mean(losses))

    if e % logging_freq == 0:
        # Reconstruct full-image predictions (intensity, gradient norm and
        # laplacian) and save a side-by-side comparison with the ground truth.
        pred_img = np.zeros_like(img)
        pred_img_grad_norm = np.zeros_like(img)
        pred_img_laplace = np.zeros_like(img)

        for d_batch in tqdm.tqdm(dataloader):
            coords = d_batch["coords"].to(torch.float32)
            coords.requires_grad = True
            coords_abs = d_batch["coords_abs"].numpy()

            pred = model(coords)
            pred_n = pred.detach().numpy().squeeze()
            pred_g = (
                GradientUtils.gradient(pred, coords)
                .norm(dim=-1)
                .detach()
                .numpy()
                .squeeze()
            )
            pred_l = GradientUtils.laplace(pred, coords).detach().numpy().squeeze()

            pred_img[coords_abs[:, 0], coords_abs[:, 1]] = pred_n
            pred_img_grad_norm[coords_abs[:, 0], coords_abs[:, 1]] = pred_g
            pred_img_laplace[coords_abs[:, 0], coords_abs[:, 1]] = pred_l

        fig, axs = plt.subplots(3, 2, constrained_layout=True)
        axs[0, 0].imshow(dataset.img, cmap="gray")
        axs[0, 1].imshow(pred_img, cmap="gray")
        axs[1, 0].imshow(dataset.grad_norm, cmap="gray")
        axs[1, 1].imshow(pred_img_grad_norm, cmap="gray")
        axs[2, 0].imshow(dataset.laplace, cmap="gray")
        axs[2, 1].imshow(pred_img_laplace, cmap="gray")

        for row in axs:
            for ax in row:
                ax.set_axis_off()

        fig.suptitle(f"Iteration: {e}")
        axs[0, 0].set_title("Ground truth")
        axs[0, 1].set_title("Prediction")

        plt.savefig(f"visualization/{e}.png")
        plt.close(fig)  # avoid accumulating open figures across epochs
================================================
FILE: github_adventures/vision_transformer/classes.txt
================================================
tench, Tinca_tinca
goldfish, Carassius_auratus
great_white_shark, white_shark, man-eater, man-eating_shark, Carcharodon_carcharias
tiger_shark, Galeocerdo_cuvieri
hammerhead, hammerhead_shark
electric_ray, crampfish, numbfish, torpedo
stingray
cock
hen
ostrich, Struthio_camelus
brambling, Fringilla_montifringilla
goldfinch, Carduelis_carduelis
house_finch, linnet, Carpodacus_mexicanus
junco, snowbird
indigo_bunting, indigo_finch, indigo_bird, Passerina_cyanea
robin, American_robin, Turdus_migratorius
bulbul
jay
magpie
chickadee
water_ouzel, dipper
kite
bald_eagle, American_eagle, Haliaeetus_leucocephalus
vulture
great_grey_owl, great_gray_owl, Strix_nebulosa
European_fire_salamander, Salamandra_salamandra
common_newt, Triturus_vulgaris
eft
spotted_salamander, Ambystoma_maculatum
axolotl, mud_puppy, Ambystoma_mexicanum
bullfrog, Rana_catesbeiana
tree_frog, tree-frog
tailed_frog, bell_toad, ribbed_toad, tailed_toad, Ascaphus_trui
loggerhead, loggerhead_turtle, Caretta_caretta
leatherback_turtle, leatherback, leathery_turtle, Dermochelys_coriacea
mud_turtle
terrapin
box_turtle, box_tortoise
banded_gecko
common_iguana, iguana, Iguana_iguana
American_chameleon, anole, Anolis_carolinensis
whiptail, whiptail_lizard
agama
frilled_lizard, Chlamydosaurus_kingi
alligator_lizard
Gila_monster, Heloderma_suspectum
green_lizard, Lacerta_viridis
African_chameleon, Chamaeleo_chamaeleon
Komodo_dragon, Komodo_lizard, dragon_lizard, giant_lizard, Varanus_komodoensis
African_crocodile, Nile_crocodile, Crocodylus_niloticus
American_alligator, Alligator_mississipiensis
triceratops
thunder_snake, worm_snake, Carphophis_amoenus
ringneck_snake, ring-necked_snake, ring_snake
hognose_snake, puff_adder, sand_viper
green_snake, grass_snake
king_snake, kingsnake
garter_snake, grass_snake
water_snake
vine_snake
night_snake, Hypsiglena_torquata
boa_constrictor, Constrictor_constrictor
rock_python, rock_snake, Python_sebae
Indian_cobra, Naja_naja
green_mamba
sea_snake
horned_viper, cerastes, sand_viper, horned_asp, Cerastes_cornutus
diamondback, diamondback_rattlesnake, Crotalus_adamanteus
sidewinder, horned_rattlesnake, Crotalus_cerastes
trilobite
harvestman, daddy_longlegs, Phalangium_opilio
scorpion
black_and_gold_garden_spider, Argiope_aurantia
barn_spider, Araneus_cavaticus
garden_spider, Aranea_diademata
black_widow, Latrodectus_mactans
tarantula
wolf_spider, hunting_spider
tick
centipede
black_grouse
ptarmigan
ruffed_grouse, partridge, Bonasa_umbellus
prairie_chicken, prairie_grouse, prairie_fowl
peacock
quail
partridge
African_grey, African_gray, Psittacus_erithacus
macaw
sulphur-crested_cockatoo, Kakatoe_galerita, Cacatua_galerita
lorikeet
coucal
bee_eater
hornbill
hummingbird
jacamar
toucan
drake
red-breasted_merganser, Mergus_serrator
goose
black_swan, Cygnus_atratus
tusker
echidna, spiny_anteater, anteater
platypus, duckbill, duckbilled_platypus, duck-billed_platypus, Ornithorhynchus_anatinus
wallaby, brush_kangaroo
koala, koala_bear, kangaroo_bear, native_bear, Phascolarctos_cinereus
wombat
jellyfish
sea_anemone, anemone
brain_coral
flatworm, platyhelminth
nematode, nematode_worm, roundworm
conch
snail
slug
sea_slug, nudibranch
chiton, coat-of-mail_shell, sea_cradle, polyplacophore
chambered_nautilus, pearly_nautilus, nautilus
Dungeness_crab, Cancer_magister
rock_crab, Cancer_irroratus
fiddler_crab
king_crab, Alaska_crab, Alaskan_king_crab, Alaska_king_crab, Paralithodes_camtschatica
American_lobster, Northern_lobster, Maine_lobster, Homarus_americanus
spiny_lobster, langouste, rock_lobster, crawfish, crayfish, sea_crawfish
crayfish, crawfish, crawdad, crawdaddy
hermit_crab
isopod
white_stork, Ciconia_ciconia
black_stork, Ciconia_nigra
spoonbill
flamingo
little_blue_heron, Egretta_caerulea
American_egret, great_white_heron, Egretta_albus
bittern
crane
limpkin, Aramus_pictus
European_gallinule, Porphyrio_porphyrio
American_coot, marsh_hen, mud_hen, water_hen, Fulica_americana
bustard
ruddy_turnstone, Arenaria_interpres
red-backed_sandpiper, dunlin, Erolia_alpina
redshank, Tringa_totanus
dowitcher
oystercatcher, oyster_catcher
pelican
king_penguin, Aptenodytes_patagonica
albatross, mollymawk
grey_whale, gray_whale, devilfish, Eschrichtius_gibbosus, Eschrichtius_robustus
killer_whale, killer, orca, grampus, sea_wolf, Orcinus_orca
dugong, Dugong_dugon
sea_lion
Chihuahua
Japanese_spaniel
Maltese_dog, Maltese_terrier, Maltese
Pekinese, Pekingese, Peke
Shih-Tzu
Blenheim_spaniel
papillon
toy_terrier
Rhodesian_ridgeback
Afghan_hound, Afghan
basset, basset_hound
beagle
bloodhound, sleuthhound
bluetick
black-and-tan_coonhound
Walker_hound, Walker_foxhound
English_foxhound
redbone
borzoi, Russian_wolfhound
Irish_wolfhound
Italian_greyhound
whippet
Ibizan_hound, Ibizan_Podenco
Norwegian_elkhound, elkhound
otterhound, otter_hound
Saluki, gazelle_hound
Scottish_deerhound, deerhound
Weimaraner
Staffordshire_bullterrier, Staffordshire_bull_terrier
American_Staffordshire_terrier, Staffordshire_terrier, American_pit_bull_terrier, pit_bull_terrier
Bedlington_terrier
Border_terrier
Kerry_blue_terrier
Irish_terrier
Norfolk_terrier
Norwich_terrier
Yorkshire_terrier
wire-haired_fox_terrier
Lakeland_terrier
Sealyham_terrier, Sealyham
Airedale, Airedale_terrier
cairn, cairn_terrier
Australian_terrier
Dandie_Dinmont, Dandie_Dinmont_terrier
Boston_bull, Boston_terrier
miniature_schnauzer
giant_schnauzer
standard_schnauzer
Scotch_terrier, Scottish_terrier, Scottie
Tibetan_terrier, chrysanthemum_dog
silky_terrier, Sydney_silky
soft-coated_wheaten_terrier
West_Highland_white_terrier
Lhasa, Lhasa_apso
flat-coated_retriever
curly-coated_retriever
golden_retriever
Labrador_retriever
Chesapeake_Bay_retriever
German_short-haired_pointer
vizsla, Hungarian_pointer
English_setter
Irish_setter, red_setter
Gordon_setter
Brittany_spaniel
clumber, clumber_spaniel
English_springer, English_springer_spaniel
Welsh_springer_spaniel
cocker_spaniel, English_cocker_spaniel, cocker
Sussex_spaniel
Irish_water_spaniel
kuvasz
schipperke
groenendael
malinois
briard
kelpie
komondor
Old_English_sheepdog, bobtail
Shetland_sheepdog, Shetland_sheep_dog, Shetland
collie
Border_collie
Bouvier_des_Flandres, Bouviers_des_Flandres
Rottweiler
German_shepherd, German_shepherd_dog, German_police_dog, alsatian
Doberman, Doberman_pinscher
miniature_pinscher
Greater_Swiss_Mountain_dog
Bernese_mountain_dog
Appenzeller
EntleBucher
boxer
bull_mastiff
Tibetan_mastiff
French_bulldog
Great_Dane
Saint_Bernard, St_Bernard
Eskimo_dog, husky
malamute, malemute, Alaskan_malamute
Siberian_husky
dalmatian, coach_dog, carriage_dog
affenpinscher, monkey_pinscher, monkey_dog
basenji
pug, pug-dog
Leonberg
Newfoundland, Newfoundland_dog
Great_Pyrenees
Samoyed, Samoyede
Pomeranian
chow, chow_chow
keeshond
Brabancon_griffon
Pembroke, Pembroke_Welsh_corgi
Cardigan, Cardigan_Welsh_corgi
toy_poodle
miniature_poodle
standard_poodle
Mexican_hairless
timber_wolf, grey_wolf, gray_wolf, Canis_lupus
white_wolf, Arctic_wolf, Canis_lupus_tundrarum
red_wolf, maned_wolf, Canis_rufus, Canis_niger
coyote, prairie_wolf, brush_wolf, Canis_latrans
dingo, warrigal, warragal, Canis_dingo
dhole, Cuon_alpinus
African_hunting_dog, hyena_dog, Cape_hunting_dog, Lycaon_pictus
hyena, hyaena
red_fox, Vulpes_vulpes
kit_fox, Vulpes_macrotis
Arctic_fox, white_fox, Alopex_lagopus
grey_fox, gray_fox, Urocyon_cinereoargenteus
tabby, tabby_cat
tiger_cat
Persian_cat
Siamese_cat, Siamese
Egyptian_cat
cougar, puma, catamount, mountain_lion, painter, panther, Felis_concolor
lynx, catamount
leopard, Panthera_pardus
snow_leopard, ounce, Panthera_uncia
jaguar, panther, Panthera_onca, Felis_onca
lion, king_of_beasts, Panthera_leo
tiger, Panthera_tigris
cheetah, chetah, Acinonyx_jubatus
brown_bear, bruin, Ursus_arctos
American_black_bear, black_bear, Ursus_americanus, Euarctos_americanus
ice_bear, polar_bear, Ursus_Maritimus, Thalarctos_maritimus
sloth_bear, Melursus_ursinus, Ursus_ursinus
mongoose
meerkat, mierkat
tiger_beetle
ladybug, ladybeetle, lady_beetle, ladybird, ladybird_beetle
ground_beetle, carabid_beetle
long-horned_beetle, longicorn, longicorn_beetle
leaf_beetle, chrysomelid
dung_beetle
rhinoceros_beetle
weevil
fly
bee
ant, emmet, pismire
grasshopper, hopper
cricket
walking_stick, walkingstick, stick_insect
cockroach, roach
mantis, mantid
cicada, cicala
leafhopper
lacewing, lacewing_fly
dragonfly, darning_needle, devil's_darning_needle, sewing_needle, snake_feeder, snake_doctor, mosquito_hawk, skeeter_hawk
damselfly
admiral
ringlet, ringlet_butterfly
monarch, monarch_butterfly, milkweed_butterfly, Danaus_plexippus
cabbage_butterfly
sulphur_butterfly, sulfur_butterfly
lycaenid, lycaenid_butterfly
starfish, sea_star
sea_urchin
sea_cucumber, holothurian
wood_rabbit, cottontail, cottontail_rabbit
hare
Angora, Angora_rabbit
hamster
porcupine, hedgehog
fox_squirrel, eastern_fox_squirrel, Sciurus_niger
marmot
beaver
guinea_pig, Cavia_cobaya
sorrel
zebra
hog, pig, grunter, squealer, Sus_scrofa
wild_boar, boar, Sus_scrofa
warthog
hippopotamus, hippo, river_horse, Hippopotamus_amphibius
ox
water_buffalo, water_ox, Asiatic_buffalo, Bubalus_bubalis
bison
ram, tup
bighorn, bighorn_sheep, cimarron, Rocky_Mountain_bighorn, Rocky_Mountain_sheep, Ovis_canadensis
ibex, Capra_ibex
hartebeest
impala, Aepyceros_melampus
gazelle
Arabian_camel, dromedary, Camelus_dromedarius
llama
weasel
mink
polecat, fitch, foulmart, foumart, Mustela_putorius
black-footed_ferret, ferret, Mustela_nigripes
otter
skunk, polecat, wood_pussy
badger
armadillo
three-toed_sloth, ai, Bradypus_tridactylus
orangutan, orang, orangutang, Pongo_pygmaeus
gorilla, Gorilla_gorilla
chimpanzee, chimp, Pan_troglodytes
gibbon, Hylobates_lar
siamang, Hylobates_syndactylus, Symphalangus_syndactylus
guenon, guenon_monkey
patas, hussar_monkey, Erythrocebus_patas
baboon
macaque
langur
colobus, colobus_monkey
proboscis_monkey, Nasalis_larvatus
marmoset
capuchin, ringtail, Cebus_capucinus
howler_monkey, howler
titi, titi_monkey
spider_monkey, Ateles_geoffroyi
squirrel_monkey, Saimiri_sciureus
Madagascar_cat, ring-tailed_lemur, Lemur_catta
indri, indris, Indri_indri, Indri_brevicaudatus
Indian_elephant, Elephas_maximus
African_elephant, Loxodonta_africana
lesser_panda, red_panda, panda, bear_cat, cat_bear, Ailurus_fulgens
giant_panda, panda, panda_bear, coon_bear, Ailuropoda_melanoleuca
barracouta, snoek
eel
coho, cohoe, coho_salmon, blue_jack, silver_salmon, Oncorhynchus_kisutch
rock_beauty, Holocanthus_tricolor
anemone_fish
sturgeon
gar, garfish, garpike, billfish, Lepisosteus_osseus
lionfish
puffer, pufferfish, blowfish, globefish
abacus
abaya
academic_gown, academic_robe, judge's_robe
accordion, piano_accordion, squeeze_box
acoustic_guitar
aircraft_carrier, carrier, flattop, attack_aircraft_carrier
airliner
airship, dirigible
altar
ambulance
amphibian, amphibious_vehicle
analog_clock
apiary, bee_house
apron
ashcan, trash_can, garbage_can, wastebin, ash_bin, ash-bin, ashbin, dustbin, trash_barrel, trash_bin
assault_rifle, assault_gun
backpack, back_pack, knapsack, packsack, rucksack, haversack
bakery, bakeshop, bakehouse
balance_beam, beam
balloon
ballpoint, ballpoint_pen, ballpen, Biro
Band_Aid
banjo
bannister, banister, balustrade, balusters, handrail
barbell
barber_chair
barbershop
barn
barometer
barrel, cask
barrow, garden_cart, lawn_cart, wheelbarrow
baseball
basketball
bassinet
bassoon
bathing_cap, swimming_cap
bath_towel
bathtub, bathing_tub, bath, tub
beach_wagon, station_wagon, wagon, estate_car, beach_waggon, station_waggon, waggon
beacon, lighthouse, beacon_light, pharos
beaker
bearskin, busby, shako
beer_bottle
beer_glass
bell_cote, bell_cot
bib
bicycle-built-for-two, tandem_bicycle, tandem
bikini, two-piece
binder, ring-binder
binoculars, field_glasses, opera_glasses
birdhouse
boathouse
bobsled, bobsleigh, bob
bolo_tie, bolo, bola_tie, bola
bonnet, poke_bonnet
bookcase
bookshop, bookstore, bookstall
bottlecap
bow
bow_tie, bow-tie, bowtie
brass, memorial_tablet, plaque
brassiere, bra, bandeau
breakwater, groin, groyne, mole, bulwark, seawall, jetty
breastplate, aegis, egis
broom
bucket, pail
buckle
bulletproof_vest
bullet_train, bullet
butcher_shop, meat_market
cab, hack, taxi, taxicab
caldron, cauldron
candle, taper, wax_light
cannon
canoe
can_opener, tin_opener
cardigan
car_mirror
carousel, carrousel, merry-go-round, roundabout, whirligig
carpenter's_kit, tool_kit
carton
car_wheel
cash_machine, cash_dispenser, automated_teller_machine, automatic_teller_machine, automated_teller, automatic_teller, ATM
cassette
cassette_player
castle
catamaran
CD_player
cello, violoncello
cellular_telephone, cellular_phone, cellphone, cell, mobile_phone
chain
chainlink_fence
chain_mail, ring_mail, mail, chain_armor, chain_armour, ring_armor, ring_armour
chain_saw, chainsaw
chest
chiffonier, commode
chime, bell, gong
china_cabinet, china_closet
Christmas_stocking
church, church_building
cinema, movie_theater, movie_theatre, movie_house, picture_palace
cleaver, meat_cleaver, chopper
cliff_dwelling
cloak
clog, geta, patten, sabot
cocktail_shaker
coffee_mug
coffeepot
coil, spiral, volute, whorl, helix
combination_lock
computer_keyboard, keypad
confectionery, confectionary, candy_store
container_ship, containership, container_vessel
convertible
corkscrew, bottle_screw
cornet, horn, trumpet, trump
cowboy_boot
cowboy_hat, ten-gallon_hat
cradle
crane
crash_helmet
crate
crib, cot
Crock_Pot
croquet_ball
crutch
cuirass
dam, dike, dyke
desk
desktop_computer
dial_telephone, dial_phone
diaper, nappy, napkin
digital_clock
digital_watch
dining_table, board
dishrag, dishcloth
dishwasher, dish_washer, dishwashing_machine
disk_brake, disc_brake
dock, dockage, docking_facility
dogsled, dog_sled, dog_sleigh
dome
doormat, welcome_mat
drilling_platform, offshore_rig
drum, membranophone, tympan
drumstick
dumbbell
Dutch_oven
electric_fan, blower
electric_guitar
electric_locomotive
entertainment_center
envelope
espresso_maker
face_powder
feather_boa, boa
file, file_cabinet, filing_cabinet
fireboat
fire_engine, fire_truck
fire_screen, fireguard
flagpole, flagstaff
flute, transverse_flute
folding_chair
football_helmet
forklift
fountain
fountain_pen
four-poster
freight_car
French_horn, horn
frying_pan, frypan, skillet
fur_coat
garbage_truck, dustcart
gasmask, respirator, gas_helmet
gas_pump, gasoline_pump, petrol_pump, island_dispenser
goblet
go-kart
golf_ball
golfcart, golf_cart
gondola
gong, tam-tam
gown
grand_piano, grand
greenhouse, nursery, glasshouse
grille, radiator_grille
grocery_store, grocery, food_market, market
guillotine
hair_slide
hair_spray
half_track
hammer
hamper
hand_blower, blow_dryer, blow_drier, hair_dryer, hair_drier
hand-held_computer, hand-held_microcomputer
handkerchief, hankie, hanky, hankey
hard_disc, hard_disk, fixed_disk
harmonica, mouth_organ, harp, mouth_harp
harp
harvester, reaper
hatchet
holster
home_theater, home_theatre
honeycomb
hook, claw
hoopskirt, crinoline
horizontal_bar, high_bar
horse_cart, horse-cart
hourglass
iPod
iron, smoothing_iron
jack-o'-lantern
jean, blue_jean, denim
jeep, landrover
jersey, T-shirt, tee_shirt
jigsaw_puzzle
jinrikisha, ricksha, rickshaw
joystick
kimono
knee_pad
knot
lab_coat, laboratory_coat
ladle
lampshade, lamp_shade
laptop, laptop_computer
lawn_mower, mower
lens_cap, lens_cover
letter_opener, paper_knife, paperknife
library
lifeboat
lighter, light, igniter, ignitor
limousine, limo
liner, ocean_liner
lipstick, lip_rouge
Loafer
lotion
loudspeaker, speaker, speaker_unit, loudspeaker_system, speaker_system
loupe, jeweler's_loupe
lumbermill, sawmill
magnetic_compass
mailbag, postbag
mailbox, letter_box
maillot
maillot, tank_suit
manhole_cover
maraca
marimba, xylophone
mask
matchstick
maypole
maze, labyrinth
measuring_cup
medicine_chest, medicine_cabinet
megalith, megalithic_structure
microphone, mike
microwave, microwave_oven
military_uniform
milk_can
minibus
miniskirt, mini
minivan
missile
mitten
mixing_bowl
mobile_home, manufactured_home
Model_T
modem
monastery
monitor
moped
mortar
mortarboard
mosque
mosquito_net
motor_scooter, scooter
mountain_bike, all-terrain_bike, off-roader
mountain_tent
mouse, computer_mouse
mousetrap
moving_van
muzzle
nail
neck_brace
necklace
nipple
notebook, notebook_computer
obelisk
oboe, hautboy, hautbois
ocarina, sweet_potato
odometer, hodometer, mileometer, milometer
oil_filter
organ, pipe_organ
oscilloscope, scope, cathode-ray_oscilloscope, CRO
overskirt
oxcart
oxygen_mask
packet
paddle, boat_paddle
paddlewheel, paddle_wheel
padlock
paintbrush
pajama, pyjama, pj's, jammies
palace
panpipe, pandean_pipe, syrinx
paper_towel
parachute, chute
parallel_bars, bars
park_bench
parking_meter
passenger_car, coach, carriage
patio, terrace
pay-phone, pay-station
pedestal, plinth, footstall
pencil_box, pencil_case
pencil_sharpener
perfume, essence
Petri_dish
photocopier
pick, plectrum, plectron
pickelhaube
picket_fence, paling
pickup, pickup_truck
pier
piggy_bank, penny_bank
pill_bottle
pillow
ping-pong_ball
pinwheel
pirate, pirate_ship
pitcher, ewer
plane, carpenter's_plane, woodworking_plane
planetarium
plastic_bag
plate_rack
plow, plough
plunger, plumber's_helper
Polaroid_camera, Polaroid_Land_camera
pole
police_van, police_wagon, paddy_wagon, patrol_wagon, wagon, black_Maria
poncho
pool_table, billiard_table, snooker_table
pop_bottle, soda_bottle
pot, flowerpot
potter's_wheel
power_drill
prayer_rug, prayer_mat
printer
prison, prison_house
projectile, missile
projector
puck, hockey_puck
punching_bag, punch_bag, punching_ball, punchball
purse
quill, quill_pen
quilt, comforter, comfort, puff
racer, race_car, racing_car
racket, racquet
radiator
radio, wireless
radio_telescope, radio_reflector
rain_barrel
recreational_vehicle, RV, R.V.
reel
reflex_camera
refrigerator, icebox
remote_control, remote
restaurant, eating_house, eating_place, eatery
revolver, six-gun, six-shooter
rifle
rocking_chair, rocker
rotisserie
rubber_eraser, rubber, pencil_eraser
rugby_ball
rule, ruler
running_shoe
safe
safety_pin
saltshaker, salt_shaker
sandal
sarong
sax, saxophone
scabbard
scale, weighing_machine
school_bus
schooner
scoreboard
screen, CRT_screen
screw
screwdriver
seat_belt, seatbelt
sewing_machine
shield, buckler
shoe_shop, shoe-shop, shoe_store
shoji
shopping_basket
shopping_cart
shovel
shower_cap
shower_curtain
ski
ski_mask
sleeping_bag
slide_rule, slipstick
sliding_door
slot, one-armed_bandit
snorkel
snowmobile
snowplow, snowplough
soap_dispenser
soccer_ball
sock
solar_dish, solar_collector, solar_furnace
sombrero
soup_bowl
space_bar
space_heater
space_shuttle
spatula
speedboat
spider_web, spider's_web
spindle
sports_car, sport_car
spotlight, spot
stage
steam_locomotive
steel_arch_bridge
steel_drum
stethoscope
stole
stone_wall
stopwatch, stop_watch
stove
strainer
streetcar, tram, tramcar, trolley, trolley_car
stretcher
studio_couch, day_bed
stupa, tope
submarine, pigboat, sub, U-boat
suit, suit_of_clothes
sundial
sunglass
sunglasses, dark_glasses, shades
sunscreen, sunblock, sun_blocker
suspension_bridge
swab, swob, mop
sweatshirt
swimming_trunks, bathing_trunks
swing
switch, electric_switch, electrical_switch
syringe
table_lamp
tank, army_tank, armored_combat_vehicle, armoured_combat_vehicle
tape_player
teapot
teddy, teddy_bear
television, television_system
tennis_ball
thatch, thatched_roof
theater_curtain, theatre_curtain
thimble
thresher, thrasher, threshing_machine
throne
tile_roof
toaster
tobacco_shop, tobacconist_shop, tobacconist
toilet_seat
torch
totem_pole
tow_truck, tow_car, wrecker
toyshop
tractor
trailer_truck, tractor_trailer, trucking_rig, rig, articulated_lorry, semi
tray
trench_coat
tricycle, trike, velocipede
trimaran
tripod
triumphal_arch
trolleybus, trolley_coach, trackless_trolley
trombone
tub, vat
turnstile
typewriter_keyboard
umbrella
unicycle, monocycle
upright, upright_piano
vacuum, vacuum_cleaner
vase
vault
velvet
vending_machine
vestment
viaduct
violin, fiddle
volleyball
waffle_iron
wall_clock
wallet, billfold, notecase, pocketbook
wardrobe, closet, press
warplane, military_plane
washbasin, handbasin, washbowl, lavabo, wash-hand_basin
washer, automatic_washer, washing_machine
water_bottle
water_jug
water_tower
whiskey_jug
whistle
wig
window_screen
window_shade
Windsor_tie
wine_bottle
wing
wok
wooden_spoon
wool, woolen, woollen
worm_fence, snake_fence, snake-rail_fence, Virginia_fence
wreck
yawl
yurt
web_site, website, internet_site, site
comic_book
crossword_puzzle, crossword
street_sign
traffic_light, traffic_signal, stoplight
book_jacket, dust_cover, dust_jacket, dust_wrapper
menu
plate
guacamole
consomme
hot_pot, hotpot
trifle
ice_cream, icecream
ice_lolly, lolly, lollipop, popsicle
French_loaf
bagel, beigel
pretzel
cheeseburger
hotdog, hot_dog, red_hot
mashed_potato
head_cabbage
broccoli
cauliflower
zucchini, courgette
spaghetti_squash
acorn_squash
butternut_squash
cucumber, cuke
artichoke, globe_artichoke
bell_pepper
cardoon
mushroom
Granny_Smith
strawberry
orange
lemon
fig
pineapple, ananas
banana
jackfruit, jak, jack
custard_apple
pomegranate
hay
carbonara
chocolate_sauce, chocolate_syrup
dough
meat_loaf, meatloaf
pizza, pizza_pie
potpie
burrito
red_wine
espresso
cup
eggnog
alp
bubble
cliff, drop, drop-off
coral_reef
geyser
lakeside, lakeshore
promontory, headland, head, foreland
sandbar, sand_bar
seashore, coast, seacoast, sea-coast
valley, vale
volcano
ballplayer, baseball_player
groom, bridegroom
scuba_diver
rapeseed
daisy
yellow_lady's_slipper, yellow_lady-slipper, Cypripedium_calceolus, Cypripedium_parviflorum
corn
acorn
hip, rose_hip, rosehip
buckeye, horse_chestnut, conker
coral_fungus
agaric
gyromitra
stinkhorn, carrion_fungus
earthstar
hen-of-the-woods, hen_of_the_woods, Polyporus_frondosus, Grifola_frondosa
bolete
ear, spike, capitulum
toilet_tissue, toilet_paper, bathroom_tissue
================================================
FILE: github_adventures/vision_transformer/custom.py
================================================
import torch
import torch.nn as nn
class PatchEmbed(nn.Module):
    """Split an image into patches and embed each of them.

    Parameters
    ----------
    img_size : int
        Size of the image (it is a square).
    patch_size : int
        Size of the patch (it is a square).
    in_chans : int
        Number of input channels.
    embed_dim : int
        The embedding dimension.

    Attributes
    ----------
    n_patches : int
        Number of patches inside of our image.
    proj : nn.Conv2d
        Convolutional layer that does both the splitting into patches
        and their embedding.
    """

    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        # A convolution whose kernel and stride both equal the patch size
        # extracts non-overlapping patches and projects each one to
        # `embed_dim` in a single operation.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches, embed_dim)`.
        """
        embedded = self.proj(x)  # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)
        embedded = embedded.flatten(start_dim=2)  # (n_samples, embed_dim, n_patches)
        return embedded.transpose(1, 2)  # (n_samples, n_patches, embed_dim)
class Attention(nn.Module):
    """Multi-head self-attention mechanism.

    Parameters
    ----------
    dim : int
        The input and out dimension of per token features.
    n_heads : int
        Number of attention heads.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    attn_p : float
        Dropout probability applied to the query, key and value tensors.
    proj_p : float
        Dropout probability applied to the output tensor.

    Attributes
    ----------
    scale : float
        Normalizing constant for the dot product.
    qkv : nn.Linear
        Linear projection for the query, key and value.
    proj : nn.Linear
        Linear mapping that takes in the concatenated output of all attention
        heads and maps it into a new space.
    attn_drop, proj_drop : nn.Dropout
        Dropout layers.
    """

    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5  # 1 / sqrt(head_dim)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        n_samples, n_tokens, dim = x.shape
        if dim != self.dim:
            raise ValueError

        # Project once, then carve out per-head queries, keys and values.
        qkv = self.qkv(x)  # (n_samples, n_tokens, 3 * dim)
        qkv = qkv.reshape(n_samples, n_tokens, 3, self.n_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, n_samples, n_heads, n_tokens, head_dim)
        queries, keys, values = qkv.unbind(0)

        # Scaled dot-product attention weights over the token axis.
        scores = (queries @ keys.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))

        # Weighted average over values, then merge the heads back together.
        context = (weights @ values).transpose(1, 2)  # (n_samples, n_tokens, n_heads, head_dim)
        context = context.flatten(2)  # (n_samples, n_tokens, dim)

        return self.proj_drop(self.proj(context))
class MLP(nn.Module):
    """Two-layer perceptron with GELU activation and dropout.

    Parameters
    ----------
    in_features : int
        Number of input features.
    hidden_features : int
        Number of nodes in the hidden layer.
    out_features : int
        Number of output features.
    p : float
        Dropout probability.

    Attributes
    ----------
    fc1 : nn.Linear
        The first linear layer.
    act : nn.GELU
        GELU activation function.
    fc2 : nn.Linear
        The second linear layer.
    drop : nn.Dropout
        Dropout layer applied after each linear layer.
    """

    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, in_features)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, out_features)`
        """
        hidden = self.drop(self.act(self.fc1(x)))  # (n_samples, n_patches + 1, hidden_features)
        return self.drop(self.fc2(hidden))  # (n_samples, n_patches + 1, out_features)
class Block(nn.Module):
    """Transformer block: pre-norm attention and MLP, each with a residual.

    Parameters
    ----------
    dim : int
        Embedding dimension.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Determines the hidden dimension size of the `MLP` module with respect
        to `dim`.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    p, attn_p : float
        Dropout probability.

    Attributes
    ----------
    norm1, norm2 : LayerNorm
        Layer normalization.
    attn : Attention
        Attention module.
    mlp : MLP
        MLP module.
    """

    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
            dim,
            n_heads=n_heads,
            qkv_bias=qkv_bias,
            attn_p=attn_p,
            proj_p=p,
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
            in_features=dim,
            hidden_features=hidden_features,
            out_features=dim,
            p=p,  # fix: `p` was not forwarded before, so MLP dropout was always 0
        )

    def forward(self, x):
        """Run forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.

        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        # Residual connections around the pre-norm attention and MLP sub-blocks.
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x
class VisionTransformer(nn.Module):
    """Simplified implementation of the Vision transformer.

    Parameters
    ----------
    img_size : int
        Both height and the width of the image (it is a square).
    patch_size : int
        Both height and the width of the patch (it is a square).
    in_chans : int
        Number of input channels.
    n_classes : int
        Number of classes.
    embed_dim : int
        Dimensionality of the token/patch embeddings.
    depth : int
        Number of blocks.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Determines the hidden dimension of the `MLP` module.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    p, attn_p : float
        Dropout probability.

    Attributes
    ----------
    patch_embed : PatchEmbed
        Instance of `PatchEmbed` layer.
    cls_token : nn.Parameter
        Learnable parameter representing the first token in the sequence;
        it has `embed_dim` elements.
    pos_embed : nn.Parameter
        Positional embedding of the cls token + all the patches;
        it has `(n_patches + 1) * embed_dim` elements.
    pos_drop : nn.Dropout
        Dropout layer applied after adding the positional embedding.
    blocks : nn.ModuleList
        List of `Block` modules.
    norm : nn.LayerNorm
        Final layer normalization.
    head : nn.Linear
        Classification head mapping the CLS embedding to class logits.
    """
    def __init__(
        self,
        img_size=384,
        patch_size=16,
        in_chans=3,
        n_classes=1000,
        embed_dim=768,
        depth=12,
        n_heads=12,
        mlp_ratio=4.,
        qkv_bias=True,
        p=0.,
        attn_p=0.,
    ):
        super().__init__()
        # NOTE: keep the creation order of submodules/parameters unchanged —
        # verify.py pairs parameters with the timm model positionally.
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p,
                )
                for _ in range(depth)
            ]
        )
        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.

        Returns
        -------
        logits : torch.Tensor
            Logits over all the classes - `(n_samples, n_classes)`.
        """
        batch_size = x.shape[0]

        x = self.patch_embed(x)  # (batch_size, n_patches, embed_dim)

        # Prepend the (shared) CLS token to every sample in the batch.
        cls = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls, x), dim=1)  # (batch_size, 1 + n_patches, embed_dim)
        x = self.pos_drop(x + self.pos_embed)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)

        # Classify from the CLS token embedding only.
        return self.head(x[:, 0])
================================================
FILE: github_adventures/vision_transformer/forward.py
================================================
import numpy as np
from PIL import Image
import torch
k = 10

# Load the class-id -> label mapping; the context manager closes the file
# (the original left the handle from `open` dangling).
with open("classes.txt") as f:
    imagenet_labels = dict(enumerate(f))

model = torch.load("model.pth")
model.eval()

# Scale pixel values from [0, 256) to roughly [-1, 1) as expected by the model.
img = (np.array(Image.open("cat.png")) / 128) - 1  # in the range -1, 1
inp = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).to(torch.float32)

# Inference only - no need to build the autograd graph.
with torch.no_grad():
    logits = model(inp)
probs = torch.nn.functional.softmax(logits, dim=-1)

# Top-k most probable classes.
top_probs, top_ixs = probs[0].topk(k)

for i, (ix_, prob_) in enumerate(zip(top_ixs, top_probs)):
    ix = ix_.item()
    prob = prob_.item()
    cls = imagenet_labels[ix].strip()
    print(f"{i}: {cls:<45} --- {prob:.4f}")
================================================
FILE: github_adventures/vision_transformer/verify.py
================================================
import numpy as np
import timm
import torch
from custom import VisionTransformer
# Helpers
def get_n_params(module):
    """Count the trainable parameters of `module`."""
    total = 0
    for param in module.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
def assert_tensors_equal(t1, t2):
    """Raise if the two tensors differ numerically."""
    np.testing.assert_allclose(t1.detach().numpy(), t2.detach().numpy())
model_name = "vit_base_patch16_384"
model_official = timm.create_model(model_name, pretrained=True)
model_official.eval()
print(type(model_official))

custom_config = {
    "img_size": 384,
    "in_chans": 3,
    "patch_size": 16,
    "embed_dim": 768,
    "depth": 12,
    "n_heads": 12,
    "qkv_bias": True,
    "mlp_ratio": 4,
}

model_custom = VisionTransformer(**custom_config)
model_custom.eval()

# Copy weights: both models register their parameters in the same order,
# so a positional zip pairs corresponding tensors.
for (n_o, p_o), (n_c, p_c) in zip(
    model_official.named_parameters(), model_custom.named_parameters()
):
    assert p_o.numel() == p_c.numel()
    print(f"{n_o} | {n_c}")

    p_c.data[:] = p_o.data

    assert_tensors_equal(p_c.data, p_o.data)

inp = torch.rand(1, 3, 384, 384)
# FIX: run the comparison forward passes under no_grad - we only compare
# outputs, so building the autograd graph just wastes memory and time.
with torch.no_grad():
    res_c = model_custom(inp)
    res_o = model_official(inp)

# Asserts
assert get_n_params(model_custom) == get_n_params(model_official)
assert_tensors_equal(res_c, res_o)

# Save custom model
torch.save(model_custom, "model.pth")
================================================
FILE: mini_tutorials/bentoml/README.md
================================================
1. [Resources](#resources)
2. [Installation](#installation)
3. [Instructions](#instructions)
1. [`bentoml`](#bentoml)
1. [`bentoctl`](#bentoctl)
1. [`aws` CLI](#aws-cli)
4. [Sketches](#sketches)
# Resources
* https://docs.bentoml.com/en/latest/
* https://github.com/bentoml/bentoctl
* https://github.com/bentoml/aws-sagemaker-deploy
# Installation
```bash
pip install -r requirements.txt
```
See below the actual versions at the time of making the video
```txt
bentoctl==0.4.0
bentoml==1.1.9
boto3==1.29.0
numpy==1.26.2
pydantic==2.5.1
pydantic_core==2.14.3
scikit-learn==1.3.2
```
# Instructions
## `bentoml`
Creating a model
```bash
python create_model.py
```
Listing all existing models
```bash
bentoml models list
```
Build a bento
```bash
bentoml build
```
List all existing bentos
```bash
bentoml list
```
Serve a bento locally
```bash
bentoml serve $BENTO
```
Serve a `service.py` (development)
```bash
bentoml serve service.py
```
## `bentoctl`
Install SageMaker operator
```bash
bentoctl operator install aws-sagemaker
```
Initialize
```bash
bentoctl init
```
ATTENTION: All of the below assumes that you have correctly set up AWS
secret keys and permissions.
Build a customized SageMaker image and push it to ECR
```bash
bentoctl build -f deployment_config.yaml -b $BENTO
```
Initialize terraform
```bash
terraform init
```
Look at what changes will be applied
```bash
terraform plan -var-file=bentoctl.tfvars
```
Actually apply changes
```bash
terraform apply -var-file=bentoctl.tfvars
```
Send request to the API Gateway
```bash
curl -X 'POST' "$URL/classify" -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
"sepal_width": 0,
"sepal_length": 0,
"petal_width": 0,
"petal_length": 0
}'
```
Destroy resources (not including ECR)
```bash
terraform destroy -var-file=bentoctl.tfvars
```
Destroy resources (including ECR)
```bash
bentoctl destroy
```
## `aws` CLI
Describe repositories
```bash
aws ecr describe-repositories
```
List all images in the repository `amazing-iris`
```bash
aws ecr list-images --repository-name=amazing-iris
```
List SageMaker models
```bash
aws sagemaker list-models
```
List SageMaker endpoints
```bash
aws sagemaker list-endpoints
```
# Sketches
================================================
FILE: mini_tutorials/bentoml/bentofile.yaml
================================================
service: "service:svc"
include:
- "service.py"
python:
packages:
- pydantic
- scikit-learn
models:
- iris_clf:latest
================================================
FILE: mini_tutorials/bentoml/create_model.py
================================================
import bentoml
from sklearn import datasets
from sklearn import svm
# Train a simple SVM classifier on the Iris dataset and register it
# with the local BentoML model store under the name "iris_clf".
iris = datasets.load_iris()
features, targets = iris.data, iris.target

classifier = svm.SVC(gamma="scale")
classifier.fit(features, targets)

saved_model = bentoml.sklearn.save_model("iris_clf", classifier)
print(saved_model)
================================================
FILE: mini_tutorials/bentoml/requirements.txt
================================================
bentoctl
bentoml
boto3
numpy
pydantic
scikit-learn
================================================
FILE: mini_tutorials/bentoml/service.py
================================================
from typing import Literal
import bentoml
from pydantic import BaseModel
from bentoml.io import JSON
# Wrap the stored sklearn model in a runner and expose it as a service.
iris_clf_runner = bentoml.sklearn.get("iris_clf:latest").to_runner()
svc = bentoml.Service("iris_classifier", runners=[iris_clf_runner])


class Request(BaseModel):
    """Schema of the incoming JSON payload (iris measurements)."""

    sepal_width: float
    sepal_length: float
    petal_width: float
    petal_length: float


class Response(BaseModel):
    """Schema of the outgoing JSON payload (predicted species)."""

    label: Literal["setosa", "versicolor", "virginica"]


@svc.api(input=JSON(pydantic_model=Request), output=JSON(pydantic_model=Response))
def classify(request: Request) -> Response:
    """Predict the iris species for a single set of measurements."""
    # Feature order must match the order the model was trained with.
    features = [
        request.sepal_width,
        request.sepal_length,
        request.petal_width,
        request.petal_length,
    ]
    prediction_ix = iris_clf_runner.predict.run([features])[0]
    label = ["setosa", "versicolor", "virginica"][prediction_ix]

    return Response(label=label)
================================================
FILE: mini_tutorials/custom_optimizer_in_pytorch/custom.py
================================================
import numpy as np
import torch
from torch.optim import Optimizer
class WeirdDescent(Optimizer):
    """Take a coordinate descent step for a random parameter.

    And also, make every 100th step way bigger.

    Parameters
    ----------
    parameters : iterable
        Parameters (or parameter groups) to optimize.
    lr : float
        Learning rate.
    """
    def __init__(self, parameters, lr=1e-3):
        defaults = {"lr": lr}
        super().__init__(parameters, defaults)

    def step(self, closure=None):
        """Perform a single optimization step.

        Parameters
        ----------
        closure : callable or None
            Optional closure that re-evaluates the model and returns the loss.

        Returns
        -------
        loss : torch.Tensor or None
            The closure's return value (None when no closure is given).
        """
        loss = None
        if closure is not None:
            loss = closure()

        if not self.state:
            self.state["step"] = 1
        else:
            self.state["step"] += 1

        # Every 100th step is scaled up by a factor of 100.
        c = 100 if self.state["step"] % 100 == 0 else 1

        # FIX: the original did `grad = tensor.grad.data`, which raises
        # AttributeError when `tensor.grad is None` instead of retrying.
        # Also, `np.random.choice` on a list of tensors/dicts is fragile
        # (numpy may coerce same-shaped tensors into a stacked array), so
        # we sample plain indices instead.
        grad = None
        tensor = None
        param_group = None
        while grad is None:
            group_ix = np.random.randint(len(self.param_groups))
            param_group = self.param_groups[group_ix]
            params = param_group["params"]
            tensor = params[np.random.randint(len(params))]
            if tensor.grad is not None:
                grad = tensor.grad.data

        # Zero out the gradient everywhere but a single random coordinate.
        element_ix = np.random.randint(tensor.numel())
        mask_flat = torch.zeros(tensor.numel())
        mask_flat[element_ix] = 1
        mask = mask_flat.reshape(tensor.shape)

        tensor.data.add_(grad * mask, alpha=-param_group["lr"] * c)

        return loss
================================================
FILE: mini_tutorials/custom_optimizer_in_pytorch/src.py
================================================
from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.optim import Adam, SGD
from tqdm import tqdm
from custom import WeirdDescent
def rosenbrock(xy):
    """Evaluate the Rosenbrock function at a 2D point.

    Parameters
    ----------
    xy : tuple
        Two element tuple of floats representing the x resp. y coordinates.

    Returns
    -------
    float
        The Rosenbrock function evaluated at the point `xy`.
    """
    x, y = xy
    # Global minimum of 0 at (1, 1).
    quadratic_term = (1 - x) ** 2
    valley_term = 100 * (y - x ** 2) ** 2

    return quadratic_term + valley_term
def run_optimization(xy_init, optimizer_class, n_iter, **optimizer_kwargs):
    """Minimize the Rosenbrock function and record the trajectory.

    Parameters
    ----------
    xy_init : tuple
        Two floats representing the x resp. y coordinates.
    optimizer_class : object
        Optimizer class.
    n_iter : int
        Number of iterations to run the optimization for.
    optimizer_kwargs : dict
        Additional parameters to be passed into the optimizer.

    Returns
    -------
    path : np.ndarray
        2D array of shape `(n_iter + 1, 2)` - row i holds the (x, y)
        position after i optimizer steps (row 0 is the initial point).
    """
    point = torch.tensor(xy_init, requires_grad=True)
    optimizer = optimizer_class([point], **optimizer_kwargs)

    path = np.empty((n_iter + 1, 2))
    path[0, :] = xy_init

    for step in tqdm(range(1, n_iter + 1)):
        optimizer.zero_grad()
        rosenbrock(point).backward()
        # Clip the gradient norm to keep steps bounded in steep regions.
        torch.nn.utils.clip_grad_norm_(point, 1.0)
        optimizer.step()

        path[step, :] = point.detach().numpy()

    return path
def create_animation(paths,
                     colors,
                     names,
                     figsize=(12, 12),
                     x_lim=(-2, 2),
                     y_lim=(-1, 3),
                     n_seconds=5):
    """Create an animation of optimizer paths over the Rosenbrock contours.

    Parameters
    ----------
    paths : list
        List of arrays representing the paths (history of x,y coordinates)
        the optimizer went through.
    colors : list
        List of strings representing colors for each path.
    names : list
        List of strings representing names for each path.
    figsize : tuple
        Size of the figure.
    x_lim, y_lim : tuple
        Range of the x resp. y axis.
    n_seconds : int
        Number of seconds the animation should last.

    Returns
    -------
    anim : FuncAnimation
        Animation of the paths of all the optimizers.

    Raises
    ------
    ValueError
        If `paths`, `colors` and `names` do not have the same length.
    """
    if not (len(paths) == len(colors) == len(names)):
        raise ValueError
    # The longest path determines the number of animation frames.
    path_length = max(len(path) for path in paths)

    # Background: contour plot of the Rosenbrock function on a dense grid.
    n_points = 300
    x = np.linspace(*x_lim, n_points)
    y = np.linspace(*y_lim, n_points)
    X, Y = np.meshgrid(x, y)
    Z = rosenbrock([X, Y])

    minimum = (1.0, 1.0)  # known global minimum of Rosenbrock

    fig, ax = plt.subplots(figsize=figsize)
    ax.contour(X, Y, Z, 90, cmap="jet")

    # One (initially empty) scatter per optimizer; offsets are filled in
    # frame by frame inside `animate`.
    scatters = [ax.scatter(None,
                           None,
                           label=label,
                           c=c) for c, label in zip(colors, names)]

    ax.legend(prop={"size": 25})
    ax.plot(*minimum, "rD")  # red diamond marking the minimum

    def animate(i):
        # Frame i shows the first i points of every path.
        for path, scatter in zip(paths, scatters):
            scatter.set_offsets(path[:i, :])

        ax.set_title(str(i))

    # Spread the frames evenly over `n_seconds` of wall time.
    ms_per_frame = 1000 * n_seconds / path_length

    anim = FuncAnimation(fig, animate, frames=path_length, interval=ms_per_frame)

    return anim
if __name__ == "__main__":
    # Compare three optimizers on the Rosenbrock function and render their
    # trajectories as an animated GIF.
    xy_init = (.3, .8)
    n_iter = 1500

    path_adam = run_optimization(xy_init, Adam, n_iter)
    path_sgd = run_optimization(xy_init, SGD, n_iter, lr=1e-3)
    path_weird = run_optimization(xy_init, WeirdDescent, n_iter, lr=1e-3)

    # Subsample every 10th point to keep the animation short.
    freq = 10

    anim = create_animation(
        [path_adam[::freq], path_sgd[::freq], path_weird[::freq]],
        ["green", "blue", "black"],
        ["Adam", "SGD", "Weird"],
        figsize=(12, 7),
        x_lim=(-.1, 1.1),
        y_lim=(-.1, 1.1),
        n_seconds=7,
    )
    anim.save("result.gif")

    print(path_weird[-15:])
================================================
FILE: mini_tutorials/deploying_on_kubernetes/Dockerfile
================================================
FROM huggingface/transformers-pytorch-gpu
RUN python3 -c "from transformers import AutoModel;AutoModel.from_pretrained('bert-base-uncased')"
RUN python3 -c "from transformers import AutoTokenizer;AutoTokenizer.from_pretrained('bert-base-uncased')"
RUN pip install fastapi uvicorn
EXPOSE 8888
ENTRYPOINT ["transformers-cli", "serve", "--port=8888", "--host=0.0.0.0", "--task=fill-mask", "--model=bert-base-uncased"]
================================================
FILE: mini_tutorials/deploying_on_kubernetes/DockerfileConda
================================================
FROM continuumio/miniconda3
RUN conda install -c conda-forge pytorch-cpu
RUN conda install -c conda-forge fastapi
RUN conda install -c conda-forge uvicorn
RUN conda install -c huggingface transformers
RUN conda install -c conda-forge huggingface_hub=0.2.1
RUN python3 -c "from transformers import AutoModel;AutoModel.from_pretrained('bert-base-uncased')"
RUN python3 -c "from transformers import AutoTokenizer;AutoTokenizer.from_pretrained('bert-base-uncased')"
EXPOSE 8888
ENTRYPOINT ["transformers-cli", "serve", "--port=8888", "--host=0.0.0.0", "--task=fill-mask", "--model=bert-base-uncased"]
================================================
FILE: mini_tutorials/deploying_on_kubernetes/README.md
================================================
# Relevant commands
## Creating an API
```bash
transformers-cli serve --task=fill-mask --model=bert-base-uncased
```
```bash
curl http://localhost:8888 | jq
```
```bash
curl -X POST http://localhost:8888/forward -H "accept: application/json" -H "Content-Type: application/json" -d '{"inputs": "Today is going to be a [MASK] day"}' | jq
```
## Containerization
Build first image.
```bash
docker build -t cool-api:v1 .
```
Build second image.
```bash
docker build -t cool-api:v2 -f DockerfileConda .
```
Run image.
```bash
docker run -it --rm -P cool-api:v2
```
## Deploying on Kubernetes
Start a minikube cluster.
```bash
minikube start
```
Get all objects across all namespaces.
```bash
kubectl get all -A
```
List images.
```bash
minikube image list
```
Load an image.
```bash
minikube image load cool-api:v2
```
Create a deployment.
```bash
kubectl create deploy cool-deploy --image=cool-api:v2
```
Create a service.
```bash
kubectl expose deploy/cool-deploy --name=cool-service --target-port=8888 --port=1234
```
Scale up.
```bash
kubectl scale deploy/cool-deploy --replicas=3
```
Get logs.
```bash
kubectl logs -f PODFULLNAME
```
================================================
FILE: mini_tutorials/embedding/README.md
================================================
# Training data
The Dracula book can be found here: https://archive.org/stream/draculabr00stokuoft/draculabr00stokuoft_djvu.txt
================================================
FILE: mini_tutorials/embedding/Visualize.ipynb
================================================
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "incredible-backup",
"metadata": {},
"outputs": [],
"source": [
"import ipywidgets\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "proud-accreditation",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"res.csv\")\n",
"last_epoch = df[\"epoch\"].max()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "canadian-nightlife",
"metadata": {},
"outputs": [],
"source": [
"@ipywidgets.interact\n",
"def f(epoch=ipywidgets.IntSlider(min=0, max=last_epoch , continuous_update=False)):\n",
" fig, ax = plt.subplots(1, 1, figsize=(12, 8))\n",
" ax.set_xlim([-2, 2])\n",
" ax.set_ylim([-2, 2])\n",
" df_iter = df[df[\"epoch\"] == epoch]\n",
" df_iter.plot(kind='scatter', x='dim_0',y='dim_1', ax=ax, c=\"red\")\n",
" df_iter[['dim_0','dim_1','character']].apply(lambda row:\n",
" ax.text(row[\"dim_0\"] + 0.02,\n",
" row[\"dim_1\"] + 0.01,\n",
" row[\"character\"],\n",
" fontsize=18),\n",
" axis=1)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "early-vinyl",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
================================================
FILE: mini_tutorials/embedding/src.py
================================================
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import torch
from torch.nn import Embedding, Linear, LSTM, Module
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm
class CharacterDataset(Dataset):
    """Character-level dataset built from a single input text.

    Parameters
    ----------
    text : str
        Input text that will be used to create the entire database.
    window_size : int
        Number of characters to use as input features.
    vocab_size : int
        Number of characters in the vocabulary. Note that the last character
        is always reserved for a special "~" out-of-vocabulary character.

    Attributes
    ----------
    ch2ix : defaultdict
        Mapping from the character to its position in the vocabulary.
        Characters outside of the vocabulary map to `vocab_size - 1`.
    ix2ch : dict
        Inverse mapping of `ch2ix`.
    vocabulary : list
        List of all characters. `len(vocabulary) == vocab_size`.
    """

    def __init__(self, text, window_size=1, vocab_size=50):
        self.text = text.replace("\n", " ")
        self.window_size = window_size

        # Unknown characters fall back to the last vocabulary slot.
        self.ch2ix = defaultdict(lambda: vocab_size - 1)

        # The (vocab_size - 1) most frequent characters fill the vocabulary;
        # the final slot is the out-of-vocabulary character "~".
        top_chars = Counter(self.text).most_common()[: (vocab_size - 1)]
        self.ch2ix.update({ch: ix for ix, (ch, _) in enumerate(top_chars)})
        self.ch2ix["~"] = vocab_size - 1

        self.ix2ch = {ix: ch for ch, ix in self.ch2ix.items()}
        self.vocabulary = [self.ix2ch[ix] for ix in range(vocab_size)]

    def __len__(self):
        # One sample per window position; the target character must exist.
        return len(self.text) - self.window_size

    def __getitem__(self, ix):
        # Features: the window of characters; target: the character after it.
        window = self.text[ix : ix + self.window_size]
        X = torch.LongTensor([self.ch2ix[ch] for ch in window])
        y = self.ch2ix[self.text[ix + self.window_size]]

        return X, y
class Network(Module):
    """Custom network predicting the next character of a string.

    Parameters
    ----------
    vocab_size : int
        The number of characters in the vocabulary.
    embedding_dim : int
        Dimension of the character embedding vectors.
    dense_dim : int
        Number of neurons in the linear layer that follows the LSTM.
    hidden_dim : int
        Size of the LSTM hidden state.
    max_norm : int
        If any of the embedding vectors has a higher L2 norm than `max_norm`
        it is rescaled.
    n_layers : int
        Number of the layers of the LSTM.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim=2,
        dense_dim=32,
        hidden_dim=8,
        max_norm=2,
        n_layers=1,
    ):
        super().__init__()
        # The OOV index (last slot) acts as padding and is never updated.
        self.embedding = Embedding(
            vocab_size,
            embedding_dim,
            padding_idx=vocab_size - 1,
            norm_type=2,
            max_norm=max_norm,
        )
        self.lstm = LSTM(
            embedding_dim, hidden_dim, batch_first=True, num_layers=n_layers
        )
        self.linear_1 = Linear(hidden_dim, dense_dim)
        self.linear_2 = Linear(dense_dim, vocab_size)

    def forward(self, x, h=None, c=None):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Input tensor of shape `(n_samples, window_size)` of dtype
            `torch.int64`.
        h, c : torch.Tensor or None
            Hidden states of the LSTM.

        Returns
        -------
        logits : torch.Tensor
            Tensor of shape `(n_samples, vocab_size)`.
        h, c : torch.Tensor or None
            Hidden states of the LSTM.
        """
        emb = self.embedding(x)  # (n_samples, window_size, embedding_dim)

        # Reuse the provided state when both parts are present, otherwise
        # let the LSTM start from zeros.
        if h is not None and c is not None:
            _, (h, c) = self.lstm(emb, (h, c))
        else:
            _, (h, c) = self.lstm(emb)  # (n_layers, n_samples, hidden_dim)

        # Average the hidden state over the LSTM layers, then classify.
        pooled = h.mean(dim=0)  # (n_samples, hidden_dim)
        dense = self.linear_1(pooled)  # (n_samples, dense_dim)
        logits = self.linear_2(dense)  # (n_samples, vocab_size)

        return logits, h, c
def compute_loss(cal, net, dataloader):
    """Compute the average loss over a dataset.

    Parameters
    ----------
    cal : callable
        Loss callable, e.g. `torch.nn.CrossEntropyLoss()`.
    net : Module
        Network whose forward pass returns `(logits, h, c)`.
    dataloader : iterable
        Iterable yielding `(X_batch, y_batch)` pairs.

    Returns
    -------
    float
        Mean of the per-batch losses.
    """
    net.eval()
    all_losses = []
    # FIX: evaluation does not need gradients - disabling autograd avoids
    # building the graph for every batch (memory/time waste in the original).
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            probs, _, _ = net(X_batch)
            all_losses.append(cal(probs, y_batch).item())

    return np.mean(all_losses)
def generate_text(n_chars, net, dataset, initial_text="Hello", random_state=None):
    """Generate text with the character-level model.

    Parameters
    ----------
    n_chars : int
        Number of characters to generate.
    net : Module
        Character-level model.
    dataset : CharacterDataset
        Instance of the `CharacterDataset`.
    initial_text : str
        The starting text to be used as the initial condition for the model.
    random_state : None or int
        If not None, then the result is reproducible.

    Returns
    -------
    res : str
        Generated text (the initial text followed by `n_chars` sampled
        characters).

    Raises
    ------
    ValueError
        If `initial_text` is empty.
    """
    if not initial_text:
        raise ValueError("You need to specify the initial text")

    res = initial_text
    net.eval()
    h, c = None, None  # LSTM state carried across generated characters

    if random_state is not None:
        np.random.seed(random_state)

    for _ in range(n_chars):
        # The first step feeds the entire seed text; afterwards only the
        # most recent character is fed, since the LSTM state (h, c) already
        # encodes the history.
        previous_chars = initial_text if res == initial_text else res[-1]

        features = torch.LongTensor([[dataset.ch2ix[c] for c in previous_chars]])
        logits, h, c = net(features, h, c)
        # Sample the next character from the model's output distribution.
        probs = F.softmax(logits[0], dim=0).detach().numpy()
        new_ch = np.random.choice(dataset.vocabulary, p=probs)
        res += new_ch

    return res
if __name__ == "__main__":
    with open("text.txt", "r") as f:
        text = "\n".join(f.readlines())

    # Hyperparameters model
    vocab_size = 70
    window_size = 10
    embedding_dim = 2
    hidden_dim = 16
    dense_dim = 32
    n_layers = 1
    max_norm = 2

    # Training config
    n_epochs = 25
    train_val_split = 0.8
    batch_size = 128
    random_state = 13

    torch.manual_seed(random_state)

    loss_f = torch.nn.CrossEntropyLoss()
    dataset = CharacterDataset(text, window_size=window_size, vocab_size=vocab_size)

    # Contiguous split: the first 80% of the text is used for training,
    # the remainder for validation (no shuffling across the boundary).
    n_samples = len(dataset)
    split_ix = int(n_samples * train_val_split)
    train_indices, val_indices = np.arange(split_ix), np.arange(split_ix, n_samples)
    train_dataloader = DataLoader(
        dataset, sampler=SubsetRandomSampler(train_indices), batch_size=batch_size
    )
    val_dataloader = DataLoader(
        dataset, sampler=SubsetRandomSampler(val_indices), batch_size=batch_size
    )

    net = Network(
        vocab_size,
        hidden_dim=hidden_dim,
        n_layers=n_layers,
        dense_dim=dense_dim,
        embedding_dim=embedding_dim,
        max_norm=max_norm,
    )
    optimizer = torch.optim.Adam(
        net.parameters(),
        lr=1e-2,
    )

    # One DataFrame of embedding weights per epoch, for visualization later.
    emb_history = []

    for e in range(n_epochs + 1):
        net.train()
        for X_batch, y_batch in tqdm(train_dataloader):
            # Epoch 0 skips training entirely so the loop below records the
            # untrained baseline losses and embeddings.
            if e == 0:
                break
            optimizer.zero_grad()
            probs, _, _ = net(X_batch)
            loss = loss_f(probs, y_batch)
            loss.backward()
            optimizer.step()

        train_loss = compute_loss(loss_f, net, train_dataloader)
        val_loss = compute_loss(loss_f, net, val_dataloader)
        print(f"Epoch: {e}, {train_loss=:.3f}, {val_loss=:.3f}")

        # Generate one sentence
        initial_text = "I hope it works "
        generated_text = generate_text(
            100, net, dataset, initial_text=initial_text, random_state=random_state
        )
        print(generated_text)

        # Snapshot the embedding weights (one row per vocabulary character).
        weights = net.embedding.weight.detach().clone().numpy()
        df = pd.DataFrame(weights, columns=[f"dim_{i}" for i in range(embedding_dim)])
        df["epoch"] = e
        df["character"] = dataset.vocabulary
        emb_history.append(df)

    final_df = pd.concat(emb_history)
    final_df.to_csv("res.csv", index=False)
================================================
FILE: mini_tutorials/fewshot_text_classification/classify.py
================================================
import pathlib
import jinja2
import openai
# Load the few-shot prompt template from disk.
path = pathlib.Path("template.jinja2")

with path.open() as f:
    prompt_template = jinja2.Template(f.read())

# Candidate labels with human-readable descriptions shown in the prompt.
labels = [
    {"label": 0, "description": "negative sentiment"},
    {"label": 1, "description": "neutral sentiment"},
    {"label": 2, "description": "positive sentiment"},
]

# Few-shot examples given to the model before the actual query.
examples = [
    {"text": "Today was a horrible day", "label": 0},
    {"text": "Yesterday was a great day", "label": 2},
]

# The text we want classified.
text = "I loved the TV show"

prompt = prompt_template.render(
    examples=examples,
    labels=labels,
    text=text,
)
print(prompt)

# NOTE(review): `openai.ChatCompletion` is the legacy (<1.0) client API;
# newer clients use `OpenAI().chat.completions.create` - confirm the pinned
# openai version before upgrading.
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
)
print(completion.choices[0].message)
================================================
FILE: mini_tutorials/fewshot_text_classification/template.jinja2
================================================
I want you to classify text for me.
See below all the possible labels and their description
{% for item in labels %}
"""
description: {{ item.description }}
label: {{ item.label }}
"""
{% endfor %}
{% if examples %}
See below a couple of examples
{% for item in examples %}
"""
text: {{ item.text }}
label: {{ item.label }}
"""
{% endfor %}
{% endif %}
Here is the text that needs to be classified
"""
text: {{ text }}
label:
================================================
FILE: mini_tutorials/gradient_wrt_input/explain.py
================================================
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.models as models
from utils import compute_gradient, read_image, scale_grad, to_array
def func(inp, net=None, target=None):
    """Return the logit of a target class.

    Parameters
    ----------
    inp : torch.Tensor
        Input image (single image batch).
    net : torch.nn.Module
        Classifier network.
    target : int
        Imagenet ground truth label id.

    Returns
    -------
    logit : torch.Tensor
        Logit of the `target` class.
    """
    # Single-sample batch: select the target class logit of sample 0.
    return net(inp)[0, target]
def compute_integrated_gradients(inp, baseline, net, target, n_steps=100):
    """Compute integrated gradients.

    Parameters
    ----------
    inp : torch.Tensor
        Input image (single image batch) of shape `(1, 3, *, *)`.
    baseline : torch.Tensor
        Baseline image of the same shape as the `inp`.
    net : torch.nn.Module
        Classifier network.
    target : int
        Imagenet ground truth label id.
    n_steps : int
        Number of steps between the `inp` and `baseline` tensors.

    Returns
    -------
    ig : torch.Tensor
        Integrated gradients with the same shape as the `inp`.
    inp_grad : torch.Tensor
        Gradient with respect to the `inp` tensor. Same shape as `inp`.
    """
    # Straight-line interpolation path from the baseline to the input.
    alphas = np.linspace(0, 1, n_steps)
    path = [baseline + alpha * (inp - baseline) for alpha in alphas]
    grads = [compute_gradient(func, point, net=net, target=target) for point in path]

    # Riemann approximation of the path integral (last point excluded).
    avg_grad = torch.cat(grads[:-1]).mean(dim=0, keepdims=True)
    ig = (inp - baseline) * avg_grad

    return ig, grads[-1]
if __name__ == "__main__":
    net = models.resnet18(pretrained=True)
    net.eval()

    tensor = read_image("img.jpg")
    arr = to_array(tensor)

    n_steps = 100
    # A constant (very dark, in normalized units) image as the IG baseline.
    baseline = -1.5 * torch.ones_like(tensor)

    # 291 is the target ImageNet class id for the attribution - presumably
    # the class of img.jpg; confirm against the classes list.
    ig, inp_grad = compute_integrated_gradients(
        tensor, baseline, net, 291, n_steps=n_steps
    )

    ig_scaled = scale_grad(ig)
    inp_grad_scaled = scale_grad(inp_grad)

    # Side-by-side: baseline, input, plain-gradient and IG attributions.
    _, (ax_baseline, ax_img, ax_inp_grad, ax_ig) = plt.subplots(1, 4, figsize=(19.20,10.80))
    ax_baseline.imshow(to_array(baseline))
    ax_img.imshow(arr)
    # Attribution maps are used to modulate the input image brightness.
    ax_inp_grad.imshow(arr * inp_grad_scaled)
    ax_ig.imshow(arr * ig_scaled)

    ax_baseline.set_title("Baseline")
    ax_img.set_title("Input")
    ax_inp_grad.set_title("Gradient input")
    ax_ig.set_title("Integrated gradients")

    ax_baseline.axis("off")
    ax_img.axis("off")
    ax_inp_grad.axis("off")
    ax_ig.axis("off")

    plt.savefig("res_2.png")
================================================
FILE: mini_tutorials/gradient_wrt_input/fool.py
================================================
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.models as models
from utils import compute_gradient, read_image, to_array
def func(inp, net=None, target=None):
    """Compute negative log likelihood.

    Parameters
    ----------
    inp : torch.Tensor
        Input image (single image batch).
    net : torch.nn.Module
        Classifier network.
    target : int
        Imagenet ground truth label id.

    Returns
    -------
    loss : torch.Tensor
        Loss for the `inp` image.
    """
    out = net(inp)
    target_tensor = torch.LongTensor([target])
    loss = torch.nn.functional.nll_loss(out, target=target_tensor)

    print(f"Loss: {loss.item()}")

    return loss
def attack(tensor, net, eps=1e-3, n_iter=50):
    """Run the Fast Sign Gradient Method (FSGM) attack.

    Parameters
    ----------
    tensor : torch.Tensor
        The input image of shape `(1, 3, 224, 224)`.
    net : torch.nn.Module
        Classifier network.
    eps : float
        Determines how much we modify the image in a single iteration.
    n_iter : int
        Number of iterations.

    Returns
    -------
    new_tensor : torch.Tensor
        New image that is a modification of the input image that "fools"
        the classifier.
    orig_prediction : int
        Class id predicted for the original image.
    new_prediction : int
        Class id predicted for the modified image.
    """
    new_tensor = tensor.detach().clone()
    orig_prediction = net(tensor).argmax()
    print(f"Original prediction: {orig_prediction.item()}")

    # NOTE(review): assumes n_iter >= 1 - with n_iter == 0 `new_prediction`
    # is unbound at the return statement.
    for i in range(n_iter):
        net.zero_grad()
        # Gradient of the NLL loss w.r.t. the (current) adversarial image,
        # targeting the originally-predicted class.
        grad = compute_gradient(
            func, new_tensor, net=net, target=orig_prediction.item()
        )
        # FSGM step: move along the sign of the gradient; clamping keeps the
        # pixel values inside the (normalized) valid range.
        new_tensor = torch.clamp(new_tensor + eps * grad.sign(), -2, 2)
        new_prediction = net(new_tensor).argmax()

        if orig_prediction != new_prediction:
            # NOTE(review): `i` is 0-based, so the printed count is one lower
            # than the number of iterations actually performed.
            print(f"We fooled the network after {i} iterations!")
            print(f"New prediction: {new_prediction.item()}")
            break

    return new_tensor, orig_prediction.item(), new_prediction.item()
if __name__ == "__main__":
    net = models.resnet18(pretrained=True)
    net.eval()

    tensor = read_image("img.jpg")

    new_tensor, orig_prediction, new_prediction = attack(
        tensor, net, eps=1e-3, n_iter=100
    )

    # Plot the original, the adversarial image and their per-pixel difference.
    _, (ax_orig, ax_new, ax_diff) = plt.subplots(1, 3, figsize=(19.20,10.80))

    arr = to_array(tensor)
    new_arr = to_array(new_tensor)
    # Mean absolute difference over color channels, normalized to [0, 1].
    diff_arr = np.abs(arr - new_arr).mean(axis=-1)
    diff_arr = diff_arr / diff_arr.max()

    ax_orig.imshow(arr)
    ax_new.imshow(new_arr)
    ax_diff.imshow(diff_arr, cmap="gray")

    ax_orig.axis("off")
    ax_new.axis("off")
    ax_diff.axis("off")

    ax_orig.set_title(f"Original: {orig_prediction}")
    ax_new.set_title(f"Modified: {new_prediction}")
    ax_diff.set_title("Difference")

    plt.savefig("res_1.png")
================================================
FILE: mini_tutorials/gradient_wrt_input/utils.py
================================================
from PIL import Image
import torch
from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
ToTensor)
def compute_gradient(func, inp, **kwargs):
    """Compute the gradient with respect to `inp`.

    Parameters
    ----------
    func : callable
        Function that takes in `inp` and `kwargs` and returns a single element
        tensor.
    inp : torch.Tensor
        The tensor that we want to get the gradients for. Needs to be a leaf
        node.
    **kwargs : dict
        Additional keyword arguments passed into `func`.

    Returns
    -------
    grad : torch.Tensor
        Tensor of the same shape as `inp` that is representing the gradient.
    """
    inp.requires_grad = True

    loss = func(inp, **kwargs)
    loss.backward()

    grad = inp.grad.data
    # FIX: reset the stored gradient so repeated calls on the same tensor do
    # not accumulate (previously a second call returned a doubled gradient).
    inp.grad = None
    inp.requires_grad = False

    return grad
def read_image(path):
    """Load image from disk and convert to torch.Tensor.

    Parameters
    ----------
    path : str
        Path to the image.

    Returns
    -------
    tensor : torch.Tensor
        Single sample batch containing our image (ready to be used with
        pretrained networks). The shape is `(1, 3, 224, 224)`.
    """
    img = Image.open(path)

    # Standard ImageNet preprocessing for pretrained torchvision models.
    transform = Compose(
        [
            Resize(256),
            CenterCrop(224),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )

    # Add the leading batch dimension.
    return transform(img).unsqueeze(0)
def to_array(tensor):
    """Convert torch.Tensor to np.ndarray.

    Parameters
    ----------
    tensor : torch.Tensor
        Tensor of shape `(1, 3, *, *)` representing one sample batch of images.

    Returns
    -------
    arr : np.ndarray
        Array of shape `(*, *, 3)` representing an image that can be plotted
        directly.
    """
    img = tensor.squeeze()
    # Invert the ImageNet normalization in two steps that mirror the
    # forward transform: first undo the division by std, then the mean.
    undo = Compose(
        [
            Normalize(mean=[0, 0, 0], std=[1 / 0.229, 1 / 0.224, 1 / 0.225]),
            Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1]),
        ]
    )
    # Channels-first -> channels-last so matplotlib can display it.
    return undo(img).permute(1, 2, 0).detach().numpy()
def scale_grad(grad):
    """Scale gradient tensor.

    Parameters
    ----------
    grad : torch.Tensor
        Gradient of shape `(1, 3, *, *)`.

    Returns
    -------
    grad_arr : np.ndarray
        Array of shape `(*, *, 1)`.
    """
    # Average the absolute gradient over the colour channels; the
    # collapsed batch dimension ends up as the trailing singleton axis.
    magnitude = grad.abs().mean(dim=1).detach().permute(1, 2, 0)
    # Normalize by the 98th percentile and clip so that a few outlier
    # pixels do not wash out the rest of the visualization.
    scaled = magnitude / magnitude.quantile(0.98)
    return scaled.clamp(0, 1).numpy()
================================================
FILE: mini_tutorials/haiku_basics/buffers_in_torch.py
================================================
import torch
# BatchNorm holds both learnable parameters and non-learnable buffers
# (its running statistics); print both to show the difference.
bn = torch.nn.BatchNorm1d(5)
bn.state_dict()

# Buffers never require gradients.
for buf_name, buf in bn.named_buffers():
    print(buf_name, buf, buf.requires_grad)

# Parameters do require gradients.
for param_name, param in bn.named_parameters():
    print(param_name, param, param.requires_grad)
================================================
FILE: mini_tutorials/haiku_basics/parameter.py
================================================
from __future__ import annotations
import haiku as hk
import jax
import jax.numpy as jnp
def foo(x: jnp.ndarray) -> jnp.ndarray:
    """Add a learned offset to ``x``, then apply a random binary mask."""
    # Learnable parameter with the same shape as the input.
    offset = hk.get_parameter("c", x.shape, init=hk.initializers.RandomNormal(1))
    shifted = offset + x
    # A fresh RNG key from Haiku's per-call key supply drives the mask.
    mask = jax.random.bernoulli(hk.next_rng_key(), 0.5, x.shape)
    # Scale the masked result by 2.
    return shifted * mask * 2
# Turn the impure `foo` into a pure `init`/`apply` pair.
foo_transformed = hk.transform(foo)

init_key = jax.random.PRNGKey(24)
# A sequence of RNG keys: one fresh key is drawn per `apply` call.
apply_key_seq = hk.PRNGSequence(init_key)

x = jnp.ones((2, 5))
# Create the initial parameters (the "c" parameter inside `foo`).
params = foo_transformed.init(init_key, x)

# Each `apply` gets a different key, so the random mask differs per call.
for _ in range(2):
    res = foo_transformed.apply(params, next(apply_key_seq), x)
    print(res)
================================================
FILE: mini_tutorials/haiku_basics/reallife.py
================================================
from __future__ import annotations
import haiku as hk
import jax
import jax.numpy as jnp
def foo(x: jnp.ndarray) -> jnp.ndarray:
    """Run ``x`` through a small MLP and return the mean output as a loss."""
    network = hk.nets.MLP([4, 5, 1])
    return network(x).mean()
# `without_apply_rng` drops the RNG argument from `apply`, since `foo`
# uses no randomness at apply time.
foo_transformed = hk.without_apply_rng(hk.transform(foo))

init_key = jax.random.PRNGKey(3452)
x = jnp.ones((2, 3))
params = foo_transformed.init(init_key, x)

# Differentiate the loss with respect to the parameters, JIT-compiled.
grad_foo = jax.jit(jax.grad(foo_transformed.apply))
grads = grad_foo(params, x)
================================================
FILE: mini_tutorials/haiku_basics/requirements.txt
================================================
-e git+ssh://git@github.com/deepmind/dm-haiku.git@386efc098fd52a5cf728e7d13442138ab25eb235#egg=dm_haiku
jax==0.3.5
jaxlib==0.3.5
================================================
FILE: mini_tutorials/haiku_basics/state.py
================================================
from __future__ import annotations
import haiku as hk
import jax
import jax.numpy as jnp
def foo(x: jnp.ndarray) -> jnp.ndarray:
    """Add a learned offset and a call counter to ``x``."""
    offset = hk.get_parameter("c", x.shape, init=hk.initializers.RandomNormal(1))
    # Mutable state: initialized to 1 (via jnp.ones) and bumped each call.
    counter = hk.get_state(
        "counter", shape=[], dtype=jnp.int32, init=jnp.ones
    )
    hk.set_state("counter", counter + 1)
    # Note: the value added below is the *pre-increment* counter.
    return x + offset + counter
# `transform_with_state` is required because `foo` uses get_state/set_state.
foo_transformed = hk.transform_with_state(foo)

init_key = jax.random.PRNGKey(32)
x = jnp.ones((2, 5))
# `init` returns both the parameters and the initial state.
params, state = foo_transformed.init(init_key, x)

for i in range(2):
    print(f"After {i} iterations")
    # `apply` is pure: it returns the updated state instead of mutating.
    # The RNG key is None because `foo` draws no randomness.
    res, state = foo_transformed.apply(params, state, None, x)
    print(state)
    print(res)
================================================
FILE: mini_tutorials/httpx_rate_limiting/script.py
================================================
import asyncio
import logging
import httpx
logger = logging.getLogger()
# Silence httpx's own per-request INFO messages; keep ours visible.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.basicConfig(format="%(asctime)s %(name)s %(message)s", level=logging.INFO)
async def send_request(client: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> int:
    """Fetch one URL, gated by ``semaphore``, and return the HTTP status code."""
    url = "https://pokeapi.co/api/v2/pokemon/ditto"
    # The semaphore caps how many requests are in flight simultaneously.
    async with semaphore:
        logger.info("Sending request")
        response = await client.get(url)
        logger.info("Response received")
        return response.status_code
async def main() -> int:
    """Send 10 concurrent requests (max 5 in flight); 0 iff all returned 200."""
    semaphore = asyncio.Semaphore(5)
    async with httpx.AsyncClient() as client:
        tasks = [
            asyncio.create_task(send_request(client, semaphore))
            for _ in range(10)
        ]
        status_codes = await asyncio.gather(*tasks)
        logger.info("All work done")
    # Shell-style exit status: 0 on success, 1 on any non-200 response.
    return 0 if all(code == 200 for code in status_codes) else 1
if __name__ == "__main__":
    # Propagate the exit code of `main` to the shell.
    raise SystemExit(asyncio.run(main()))
================================================
FILE: mini_tutorials/mocking_neural_networks/app.py
================================================
import logging
import sys
import numpy as np
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
def get_top_k(sequence, tokenizer, model, k=10):
    """Get the top k most probable tokens to fill the gap with.

    Parameters
    ----------
    sequence : str
        String containing the [MASK] token.

    tokenizer : BertFastTokenizer
        Tokenizer.

    model : BertForMaskedLM
        Model.

    k : int
        Number of the top results to return.

    Returns
    -------
    top_vocab_indices : torch.Tensor
        1D tensor representing the indices of the top tokens.
    """
    encoded = tokenizer(sequence, return_tensors="pt")
    # Column index of the [MASK] token in the single-sample batch.
    mask_position = torch.where(encoded["input_ids"] == tokenizer.mask_token_id)[1]
    # Vocabulary scores at the masked position only.
    mask_logits = model(**encoded).logits[0, mask_position.item(), :]
    _, top_vocab_indices = torch.topk(mask_logits, k)
    return top_vocab_indices
if __name__ == "__main__":
    # Hugging Face emits many warnings on model load; hide them.
    logging.disable(logging.WARNING)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

    # The masked sentence comes from the command line.
    sequence = sys.argv[1]

    top_indices = get_top_k(sequence, tokenizer, model, 5)
    # Decode each vocabulary index back into a human-readable token.
    top_tokens = [tokenizer.decode(torch.tensor([ix])) for ix in top_indices]
    winner = top_tokens[0]

    # Show the candidates in random order and let the user guess the best one.
    print(np.random.permutation(top_tokens))
    guess = input("Who do you think is the winner? ").strip()

    if guess == winner:
        print("You won!!!")
    else:
        print("You lost!!!")

    print("\nTrue ranking")
    for i, x in enumerate(top_tokens):
        print(i, x)
================================================
FILE: mini_tutorials/mocking_neural_networks/test.py
================================================
from unittest.mock import Mock
import pytest
import torch
from transformers import (AutoTokenizer, AutoModelForMaskedLM, BatchEncoding,
BertForMaskedLM, BertTokenizerFast)
from app import get_top_k
@pytest.mark.parametrize("k", [5, 7])
def test_with_real_objects(k):
    """End-to-end check against the real pretrained tokenizer and model."""
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

    result = get_top_k("Hello [MASK]", tokenizer, model, k)

    assert isinstance(result, torch.Tensor)
    assert result.shape == (k,)
@pytest.mark.parametrize("k", [5, 7])
def test_with_mock_objects(k):
    """Same check, but with mocked tokenizer/model -- no downloads needed."""
    vocab_size = 1000
    encoding = BatchEncoding(
        data={"input_ids": torch.tensor([[101, 555, 103, 102]])}
    )

    # The tokenizer mock returns a fixed encoding and exposes the mask id.
    tokenizer_mock = Mock(
        spec=BertTokenizerFast,
        return_value=encoding,
        mask_token_id=103,
    )

    # The model mock returns an object whose `.logits` has the right shape.
    model_mock = Mock(spec=BertForMaskedLM)
    model_mock.return_value.logits = torch.rand(1, 4, vocab_size)

    result = get_top_k("Hello [MASK]", tokenizer_mock, model_mock, k=k)

    assert isinstance(result, torch.Tensor)
    assert result.shape == (k,)
================================================
FILE: mini_tutorials/numpy_equality_testing/test.py
================================================
import numpy as np
import pytest
def get_arrays():
    """Create 4 arrays that are all similar but different.

    Returns
    -------
    a : np.ndarray
        Reference array.

    a_eps : np.ndarray
        Same shape as `a`, however, the values are slightly different.

    a_dim : np.ndarray
        One extra dimension compared to `a`, however, the values are the same.

    a_nan : np.ndarray
        Same shape and same values, however, one entry is set to `np.nan`.
    """
    a = np.array([[1.2, 5.12, 2.4], [5.5, 8.8, 1.55]])

    # Perturb every entry by a tiny amount.
    a_eps = a + 1e-5

    # Same values with an extra leading singleton axis -> shape (1, 2, 3).
    a_dim = a[None, :]

    # Same values except for a single NaN entry.
    a_nan = a.copy()
    a_nan[0, 1] = np.nan

    return a, a_eps, a_dim, a_nan
def test___eq__():
    """`==` on arrays is elementwise, so a bare `assert` is ambiguous."""
    ref = get_arrays()[0]
    # Truth value of a multi-element boolean array raises ValueError.
    with pytest.raises(ValueError):
        assert ref == ref
def test___eq__all():
    """Elementwise `==` combined with `.all()`; note broadcasting and NaN."""
    ref, perturbed, extra_dim, with_nan = get_arrays()

    assert (ref == ref).all()
    # The tiny perturbation breaks exact elementwise equality.
    assert not (ref == perturbed).all()
    # `==` broadcasts, so the extra singleton dimension compares equal.
    assert (ref == extra_dim).all()
    # NaN never compares equal to anything, including itself.
    assert not (with_nan == with_nan).all()
def test_array_equal():
    """`np.array_equal` demands identical shape AND values; NaN needs opt-in."""
    ref, perturbed, extra_dim, with_nan = get_arrays()

    assert np.array_equal(ref, ref)
    assert not np.array_equal(ref, perturbed)
    # Unlike `==`, there is no broadcasting: different shapes are unequal.
    assert not np.array_equal(ref, extra_dim)
    assert not np.array_equal(with_nan, with_nan)
    # `equal_nan=True` treats NaNs in matching positions as equal.
    assert np.array_equal(with_nan, with_nan, equal_nan=True)
def test_allclose():
    """`np.allclose` tolerates small differences and broadcasts shapes."""
    ref, perturbed, extra_dim, with_nan = get_arrays()
    tolerance = 1e-5

    assert np.allclose(ref, ref, atol=tolerance)
    # The perturbation lies within the tolerance.
    assert np.allclose(ref, perturbed, atol=tolerance)
    # Shapes are broadcast before comparison.
    assert np.allclose(ref, extra_dim, atol=tolerance)
    assert not np.allclose(with_nan, with_nan, atol=tolerance)
    assert np.allclose(with_nan, with_nan, atol=tolerance, equal_nan=True)
def test_testing_array_equal():
    """`np.testing.assert_array_equal` raises on mismatch; NaNs are accepted."""
    ref, perturbed, extra_dim, with_nan = get_arrays()

    np.testing.assert_array_equal(ref, ref)
    # These would raise AssertionError:
    # np.testing.assert_array_equal(ref, perturbed)
    # np.testing.assert_array_equal(ref, extra_dim)

    # NaNs in matching positions are considered equal here.
    np.testing.assert_array_equal(with_nan, with_nan)
def test_testing_allclose():
    """`np.testing.assert_allclose` treats NaNs as equal by default."""
    ref, perturbed, extra_dim, with_nan = get_arrays()
    tolerance = 1e-5

    np.testing.assert_allclose(ref, ref, atol=tolerance)
    np.testing.assert_allclose(ref, perturbed, atol=tolerance)
    # Would raise -- shapes must match:
    # np.testing.assert_allclose(ref, extra_dim, atol=tolerance)
    np.testing.assert_allclose(with_nan, with_nan, atol=tolerance)
    # Would raise with NaN comparison disabled:
    # np.testing.assert_allclose(with_nan, with_nan, atol=tolerance, equal_nan=False)
================================================
FILE: mini_tutorials/openai_function_calling/example.py
================================================
import json
import logging
import operator
import sys
import datetime
import openai
import yfinance as yf
TODAY = datetime.date.today().strftime("%Y/%m/%d")
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def get_price(symbol: str, date: str) -> float:
    """Return the closing price of `symbol` on `date` via yfinance."""
    logger.info(f"Calling get_price with {symbol=} and {date=}")
    frame = yf.download(
        symbol, start=date, period="1d", interval="1d", progress=False
    )
    # Single-day download -> first (and only) row of the "Close" column.
    return frame["Close"].iloc[0].item()
def calculate(a: float, b: float, op: str) -> float:
    """Apply the `operator`-module function named `op` to `a` and `b`."""
    logger.info(f"Calling calculate with {a=}, {b=} and {op=}")
    binary_op = getattr(operator, op)
    return binary_op(a, b)
# JSON-schema style metadata advertised to the OpenAI chat API (function
# calling). It mirrors the signature of `get_price` above.
get_price_metadata = {
    "name": "get_price",
    "description": "Get closing price of a financial instrument on a given date",
    "parameters": {
        "type": "object",
        "properties": {
            "symbol": {
                "type": "string",
                "description": "Ticker symbol of a financial instrument",
            },
            "date": {
                "type": "string",
                "description": "Date in the format YYYY-MM-DD",
            },
        },
        "required": ["symbol", "date"],
    },
}

# Metadata mirroring the signature of `calculate` above; `enum` restricts
# the model to the four supported `operator` function names.
calculate_metadata = {
    "name": "calculate",
    "description": "General purpose calculator",
    "parameters": {
        "type": "object",
        "properties": {
            "a": {
                "type": "number",
                "description": "First entry",
            },
            "b": {
                "type": "number",
                "description": "Second entry",
            },
            "op": {
                "type": "string",
                "enum": ["mul", "add", "truediv", "sub"],
                "description": "Binary operation",
            },
        },
        "required": ["a", "b", "op"],
    },
}
# Seed the conversation: the user question from the CLI plus a system
# prompt anchoring today's date and the expected date format.
messages = [
    {"role": "user", "content": sys.argv[1]},
    {
        "role": "system",
        "content": "You are a helpful financial investor who overlooks the "
        f"performance of stocks. Today is {TODAY}. Note that the "
        "format of the date is YYYY/MM/DD",
    },
]

# Keep querying until the model answers directly instead of requesting
# another function call.
while True:
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        temperature=0,
        messages=messages,
        functions=[get_price_metadata, calculate_metadata],
    )
    message = response["choices"][0]["message"]
    messages.append(message)

    # No function call requested -> the model produced the final answer.
    if "function_call" not in message:
        break

    # call custom functions
    function_name = message["function_call"]["name"]
    kwargs = json.loads(message["function_call"]["arguments"])

    if function_name == "get_price":
        output = str(get_price(**kwargs))
    elif function_name == "calculate":
        output = str(calculate(**kwargs))
    else:
        # The model asked for a function we never advertised.
        raise ValueError

    # Feed the function result back so the model can continue reasoning.
    messages.append({"role": "function", "name": function_name, "content": output})

print("*" * 80)
print([m["role"] for m in messages])
print("*" * 80)
print(messages[-1]["content"])
================================================
FILE: mini_tutorials/rag_with_reranking/README.md
================================================
# Description
## Installation
Run the following command to deploy a simple OpenSearch DB locally.
```bash
docker run -p 9200:9200 -p 9600:9600 -e "DISABLE_SECURITY_PLUGIN=true" -e "discovery.type=single-node" --name opensearch-node -d opensearchproject/opensearch:latest
```
The version of the image was `2.10.0` at the time of making the video.
To install the Python dependencies run
```bash
pip install opensearch-py cohere
```
Again, I did not hardcode any version, but the versions at the time of
making the video were
```bash
cohere==4.27
opensearch-py==2.3.1
```
## Contents
* `answer.py` - script that does RAG question answering - requires the question as the only argument
* `input.txt` - each line corresponds to a document to be added to OpenSearch (except for empty lines and comments)
* `upload_data.py` - load `input.txt` into OpenSearch
Note that to use the `answer.py` you need to get a Cohere API token and
then export
```bash
export COHERE_API_KEY=VERYSECRET
python answer.py 'What is the meaning of life?'
```
## Postman
You can import the `postman_collection.json` in Postman and then
simply add the following 3 variables in your environment
* `OpenSearchURL` - will be `http://localhost:9200` if you follow the above instructions
* `CohereURL` - should be `https://api.cohere.ai/v1`
* `CohereAPIKey` - you need to generate this yourself
# Diagrams
## RAG with embeddings
## RAG with reranking
================================================
FILE: mini_tutorials/rag_with_reranking/answer.py
================================================
import os
import sys
import cohere
from opensearchpy import OpenSearch
# Helper
def generate_prompt(question: str, contexts: list):
    """Build the final LLM prompt from the question and retrieved contexts.

    Parameters
    ----------
    question : str
        The user question.

    contexts : list of str
        Retrieved (and reranked) documents to cite as sources.

    Returns
    -------
    str
        Prompt containing the instructions, the question, one line per
        source and a trailing "ANSWER: " for the model to complete.
    """
    # Note: a space was added after '("SOURCES").' -- previously the two
    # instruction sentences ran together as '("SOURCES").If you don't'.
    prompt = (
        "Given the following extracted parts of a long document and a "
        'question, create a final answer with references ("SOURCES"). '
        "If you don't know the answer, just say that you don't know, don't try "
        'to make up an answer. ALWAYS return a "SOURCES" part in your answer.\n'
    )
    prompt += f"QUESTION: {question}\n"
    prompt += "".join(
        f"SOURCE {i}: {context}\n" for i, context in enumerate(contexts)
    )
    prompt += "ANSWER: "
    return prompt
# PARAMETERS
INDEX_NAME = "cool_index"
FIELD_NAME = "stuff"
RETRIEVER_K = 5  # documents returned by BM25 retrieval
RERANKER_K = 2  # documents that survive reranking
COHERE_API_KEY = os.environ["COHERE_API_KEY"]

# The question is the single CLI argument.
question = sys.argv[1]

# Instantiate clients
os_client = OpenSearch(
    hosts=[
        {
            "host": "localhost",
            "port": 9200,
        }
    ]
)
cohere_client = cohere.Client(COHERE_API_KEY)

# Retrieve: lexical full-text match of the question against the field.
os_results = os_client.search(
    body={
        "query": {
            "match": {
                FIELD_NAME: question
            }
        }
    },
    size=RETRIEVER_K
)
contexts = [x["_source"][FIELD_NAME] for x in os_results["hits"]["hits"]]
print("OpenSearch: ", contexts)

# Rerank: Cohere scores each retrieved document against the question and
# keeps the best RERANKER_K.
cohere_results = cohere_client.rerank(
    model="rerank-english-v2.0",
    query=question,
    documents=contexts,
    top_n=RERANKER_K,
)
reranked_contexts = [r.document["text"] for r in cohere_results]
print("Cohere Reranked: ", reranked_contexts)

# Chat completion: answer the question grounded in the reranked sources.
prompt = generate_prompt(question, reranked_contexts)
response = cohere_client.chat(
    chat_history=[],
    message=prompt
)
print("Answer: ", response.text)
================================================
FILE: mini_tutorials/rag_with_reranking/input.txt
================================================
# AGE AND FAVOURITE FOOD - 'What is the favourite food of Charles?', 'Who prefers vegetables the most?'
Adam is older than Ben
Ben is older than Charles
Adam eats a lot of carrots
Ben's favourite food is an apple
Charles loves KFC
Whatever, this sentence does not really contain anything super important
# SPORTING EVENTS - 'What country managed to become world football champion after 2050'?
Brazil won the Fifa World Cup in 2070
France is pretty good at football and won many championships
Finland has won many ice hockey world cups
Jamaica won the Athletics World Cup in 2055
Mexico won the Golf World Cup in 2050
================================================
FILE: mini_tutorials/rag_with_reranking/postman_collection.json
================================================
{
"info": {
"name": "Retrieval augmented generation",
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
},
"item": [
{
"name": "OpenSearch",
"item": [
{
"name": "Get all indices",
"request": {
"method": "GET",
"header": [],
"url": {
"raw": "{{OpenSearchURL}}/_cat/indices?v=true&s=index",
"host": [
"{{OpenSearchURL}}"
],
"path": [
"_cat",
"indices"
],
"query": [
{
"key": "v",
"value": "true"
},
{
"key": "s",
"value": "index"
}
]
}
},
"response": []
},
{
"name": "Create index",
"request": {
"method": "PUT",
"header": [],
"body": {
"mode": "raw",
"raw": "{\n \"settings\": {\n \"index\": {\n \"number_of_shards\": 1,\n \"number_of_replicas\": 1\n }\n },\n \"mappings\": {\n \"properties\": {\n \"stuff\": {\n \"type\": \"text\"\n }\n }\n }\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{OpenSearchURL}}/cool_index",
"host": [
"{{OpenSearchURL}}"
],
"path": [
"cool_index"
]
}
},
"response": []
},
{
"name": "Delete index",
"request": {
"method": "DELETE",
"header": [],
"body": {
"mode": "raw",
"raw": "",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{OpenSearchURL}}/cool_index",
"host": [
"{{OpenSearchURL}}"
],
"path": [
"cool_index"
]
}
},
"response": []
},
{
"name": "Add document",
"request": {
"method": "POST",
"header": [],
"body": {
"mode": "raw",
"raw": "{\n \"stuff\": \"This is just some document\"\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{OpenSearchURL}}/cool_index/_doc",
"host": [
"{{OpenSearchURL}}"
],
"path": [
"cool_index",
"_doc"
]
}
},
"response": []
},
{
"name": "List all documents",
"request": {
"method": "POST",
"header": [],
"body": {
"mode": "raw",
"raw": "{\n \"query\": {\n \"match_all\": {}\n }\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{OpenSearchURL}}/cool_index/_search",
"host": [
"{{OpenSearchURL}}"
],
"path": [
"cool_index",
"_search"
]
}
},
"response": []
},
{
"name": "Lexical (BM 25) search",
"request": {
"method": "POST",
"header": [],
"body": {
"mode": "raw",
"raw": "{\n \"query\": {\n \"match\": {\n \"stuff\": \"Some document\"\n }\n }\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{OpenSearchURL}}/cool_index/_search",
"host": [
"{{OpenSearchURL}}"
],
"path": [
"cool_index",
"_search"
]
}
},
"response": []
}
]
},
{
"name": "Cohere",
"item": [
{
"name": "Embed",
"request": {
"method": "POST",
"header": [],
"body": {
"mode": "raw",
"raw": "{\n \"texts\": [\n \"hello\",\n \"goodbye\"\n ],\n \"truncate\": \"END\"\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{CohereURL}}/embed",
"host": [
"{{CohereURL}}"
],
"path": [
"embed"
]
},
"description": "[https://docs.cohere.com/reference/embed](https://docs.cohere.com/reference/embed)"
},
"response": []
},
{
"name": "Rerank",
"request": {
"method": "POST",
"header": [],
"body": {
"mode": "raw",
"raw": "{\n \"return_documents\": false,\n \"max_chunks_per_doc\": 10,\n \"query\": \"What is the capital of the United States?\",\n \"documents\": [\n \"Carson City is the capital city of the American state of Nevada.\",\n \"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.\",\n \"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.\",\n \"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.\"\n ]\n}",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{CohereURL}}/rerank",
"host": [
"{{CohereURL}}"
],
"path": [
"rerank"
]
},
"description": "[https://docs.cohere.com/reference/embed](https://docs.cohere.com/reference/embed)"
},
"response": []
},
{
"name": "Chat",
"request": {
"method": "POST",
"header": [],
"body": {
"mode": "raw",
"raw": " {\n \"chat_history\": [\n {\"role\": \"USER\", \"message\": \"Who discovered gravity?\"},\n {\"role\": \"CHATBOT\", \"message\": \"The man who is widely credited with discovering gravity is Sir Isaac Newton\"}\n ],\n \"message\": \"What year was he born?\"\n }",
"options": {
"raw": {
"language": "json"
}
}
},
"url": {
"raw": "{{CohereURL}}/chat",
"host": [
"{{CohereURL}}"
],
"path": [
"chat"
]
},
"description": ""
},
"response": []
}
],
"auth": {
"type": "bearer",
"bearer": [
{
"key": "token",
"value": "{{CohereAPIKey}}",
"type": "string"
}
]
},
"event": [
{
"listen": "prerequest",
"script": {
"type": "text/javascript",
"exec": [
""
]
}
},
{
"listen": "test",
"script": {
"type": "text/javascript",
"exec": [
""
]
}
}
]
}
]
}
================================================
FILE: mini_tutorials/rag_with_reranking/upload_data.py
================================================
from pathlib import Path
from opensearchpy import OpenSearch
INPUT_FILE = "input.txt"
INDEX_NAME = "cool_index"
FIELD_NAME = "stuff"
client = OpenSearch(
hosts=[
{
"host": "localhost",
"port": 9200,
}
]
)
print(client.ping())
with Path(INPUT_FILE).open() as f:
i = 0
for line in f.read().splitlines():
if not line or line.startswith("#"):
continue
print(f"Adding {i}")
client.index(index=INDEX_NAME, body={FIELD_NAME: line})
i += 1
================================================
FILE: mini_tutorials/visualizing_activations_with_forward_hooks/src.py
================================================
import pathlib
import torch
import torch.nn.functional as F
from torch.nn import Linear, Module
from torch.utils.tensorboard import SummaryWriter
class Network(Module):
    """Three fully connected layers (10 -> 20 -> 30 -> 2) with a final ReLU."""

    def __init__(self):
        super().__init__()
        self.fc_1 = Linear(10, 20)
        self.fc_2 = Linear(20, 30)
        self.fc_3 = Linear(30, 2)

    def forward(self, x):
        """Run the forward pass; only the last output goes through ReLU."""
        hidden = self.fc_1(x)
        hidden = self.fc_2(hidden)
        return F.relu(self.fc_3(hidden))
if __name__ == "__main__":
    # TensorBoard writer; histograms land in ./tensorboard_logs.
    log_dir = pathlib.Path.cwd() / "tensorboard_logs"
    writer = SummaryWriter(log_dir)

    x = torch.rand(1, 10)
    net = Network()

    def activation_hook(inst, inp, out):
        """Run activation hook.

        Parameters
        ----------
        inst : torch.nn.Module
            The layer we want to attach the hook to.

        inp : tuple of torch.Tensor
            The input to the `forward` method.

        out : torch.Tensor
            The output of the `forward` method.
        """
        print("Here")
        # Log the distribution of the layer's output under its repr().
        writer.add_histogram(repr(inst), out)

    # Attach the hook to all three linear layers; only the first handle is
    # kept so the removal can be demonstrated below.
    handle_1 = net.fc_1.register_forward_hook(activation_hook)
    net.fc_2.register_forward_hook(activation_hook)
    net.fc_3.register_forward_hook(activation_hook)

    # First forward pass: all three hooks fire.
    y = net(x)

    # After removing the first handle, only the fc_2 and fc_3 hooks fire.
    handle_1.remove()
    y = net(x)