Repository: jankrepl/mildlyoverfitted Branch: master Commit: 22f0ecc67cef Files: 118 Total size: 314.6 KB Directory structure: gitextract_ixgqmhua/ ├── .gitignore ├── LICENSE ├── README.md ├── github_adventures/ │ ├── automata/ │ │ ├── model.py │ │ └── train.py │ ├── diffaugment/ │ │ ├── README.MD │ │ ├── script.py │ │ └── utils.py │ ├── dino/ │ │ ├── data/ │ │ │ ├── README.md │ │ │ └── imagenette_labels.json │ │ ├── evaluation.py │ │ ├── train.py │ │ ├── utils.py │ │ ├── visualize_attentions.ipynb │ │ └── visualize_augmentations.ipynb │ ├── gpt/ │ │ ├── README.md │ │ ├── copy_and_generate.py │ │ ├── distribution_visualizations.ipynb │ │ ├── ipython_code.py │ │ ├── model.py │ │ ├── requirements.txt │ │ └── utils.py │ ├── integer/ │ │ ├── README.md │ │ ├── bert.py │ │ ├── experiments.sh │ │ ├── fetch_data.py │ │ ├── glove.py │ │ ├── lstm.py │ │ ├── requirements.txt │ │ └── utils.py │ ├── lottery/ │ │ ├── README.md │ │ ├── data.py │ │ ├── main.py │ │ ├── parallel_launch.sh │ │ ├── requirements.txt │ │ └── utils.py │ ├── mixer/ │ │ ├── README.md │ │ ├── official.py │ │ ├── ours.py │ │ └── test_compare.py │ ├── mixup/ │ │ ├── launch_experiments.sh │ │ ├── train.py │ │ └── utils.py │ ├── ner_evaluation/ │ │ ├── README.md │ │ ├── ours.py │ │ ├── test_ours.py │ │ └── try.py │ ├── neuron/ │ │ ├── README.md │ │ ├── evaluate_noise.py │ │ ├── evaluate_shuffling.py │ │ ├── evaluate_video.py │ │ ├── launch.sh │ │ ├── pretrained/ │ │ │ ├── MLP.pkl │ │ │ ├── MLP_augment.pkl │ │ │ ├── invariant_official.pkl │ │ │ ├── invariant_ours.pkl │ │ │ ├── linear.pkl │ │ │ └── linear_augment.pkl │ │ ├── requirements.txt │ │ ├── solutions.py │ │ ├── tasks.py │ │ ├── torch_utils.py │ │ └── trainer.py │ ├── pondernet/ │ │ ├── experiment_1.sh │ │ ├── experiment_2.sh │ │ ├── requirements.txt │ │ ├── train.py │ │ └── utils.py │ ├── product_quantization/ │ │ ├── README.md │ │ ├── convert.py │ │ ├── custom.py │ │ ├── faiss_101_ipython.py │ │ ├── generate_index.py │ │ ├── parse.py │ │ ├── requirements.txt │ │ ├── run_all.sh │ │ └── run_gradio.py │ ├── siren/ │ │ ├── activations.py │ │ ├── core.py │ │ └── train.py │ └── vision_transformer/ │ ├── classes.txt │ ├── custom.py │ ├── forward.py │ └── verify.py └── mini_tutorials/ ├── bentoml/ │ ├── README.md │ ├── bentofile.yaml │ ├── create_model.py │ ├── requirements.txt │ └── service.py ├── custom_optimizer_in_pytorch/ │ ├── custom.py │ └── src.py ├── deploying_on_kubernetes/ │ ├── Dockerfile │ ├── DockerfileConda │ └── README.md ├── embedding/ │ ├── README.md │ ├── Visualize.ipynb │ └── src.py ├── fewshot_text_classification/ │ ├── classify.py │ └── template.jinja2 ├── gradient_wrt_input/ │ ├── explain.py │ ├── fool.py │ └── utils.py ├── haiku_basics/ │ ├── buffers_in_torch.py │ ├── parameter.py │ ├── reallife.py │ ├── requirements.txt │ └── state.py ├── httpx_rate_limiting/ │ └── script.py ├── mocking_neural_networks/ │ ├── app.py │ └── test.py ├── numpy_equality_testing/ │ └── test.py ├── openai_function_calling/ │ └── example.py ├── rag_with_reranking/ │ ├── README.md │ ├── answer.py │ ├── input.txt │ ├── postman_collection.json │ └── upload_data.py └── visualizing_activations_with_forward_hooks/ └── src.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Jan Krepl Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # mildlyoverfitted Code for https://www.youtube.com/c/mildlyoverfitted. ### Overview | Name | Video | Code | |--------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------------------------------------------------------------------------| | Asynchronous requests and rate limiting | [link](https://youtu.be/luWsr9exlE4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/httpx_rate_limiting) | | BentoML Sagemaker deployment | [link](https://youtu.be/Zci_D4az9FU) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/bentoml) | | Custom optimizer in PyTorch | [link](https://youtu.be/zvp8K4iX2Cs) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/custom_optimizer_in_pytorch) | | Deploying machine learning models on Kubernetes | [link](https://youtu.be/DQRNt8Diyw4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/deploying_on_kubernetes) | | Differentiable augmentation for GANs (using Kornia) | [link](https://youtu.be/J97EM3Clyys) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/diffaugment) | | DINO in PyTorch | [link](https://youtu.be/psmMEWKk4Uk) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/dino) | | Few-shot text classification with prompts | [link](https://youtu.be/AhqgDXcBU2M) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/fewshot_text_classification) | | GPT in PyTorch | [link](https://youtu.be/d7IRM40VMYM) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/gpt) | | Gradient with respect to input in PyTorch (FGSM attack + Integrated Gradients) | [link](https://youtu.be/5lFiZTSsp40) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/gradient_wrt_input) | | Growing neural cellular automata in PyTorch | [link](https://youtu.be/21ACbWoF2Oo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/automata) | | Haiku basics | [link](https://youtu.be/yXCKS-ZoYTY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/haiku_basics) | | Integer embeddings in PyTorch | [link](https://youtu.be/bybuSBVzOdg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/integer) | | Mixup in PyTorch | [link](https://youtu.be/hGAKHKqmXdY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/mixup) | | MLP-Mixer in Flax and PyTorch | [link](https://youtu.be/HqytB2GUbHA) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/mixer) | | Mocking neural networks: unit testing in deep learning | [link](https://youtu.be/_KVV9jXSzvo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/mocking_neural_networks) | | NER model evaluation | [link](https://youtu.be/70YAUYP3hrw) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/ner_evaluation) | | NumPy equality testing | [link](https://youtu.be/sai1g5fjyb8) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/numpy_equality_testing) | | OpenAI function calling | [link](https://youtu.be/_B7F_6nTVEg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/openai_function_calling) | | PonderNet in PyTorch | [link](https://youtu.be/JLFz1dU5HR4) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/pondernet) | | Product quantization in Faiss and from scratch | [link](https://youtu.be/PNVJvZEkuXo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/product_quantization) | | Retrieval augmented generation with OpenSearch and reranking | [link](https://youtu.be/OsE7YcDcPz0) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/rag_with_reranking) | | SIREN in PyTorch | [link](https://youtu.be/s4iFEoNlYhM) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/siren) | | The Lottery Ticket Hypothesis and pruning in PyTorch | [link](https://youtu.be/bQt0CLXXAqg) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/lottery) | | The Sensory Neuron as a Transformer in PyTorch | [link](https://youtu.be/mi_mzlhBGAU) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/neuron) | | `torch.nn.Embedding` explained (+ Character-level language model) | [link](https://youtu.be/euwN5DHfLEo) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/embedding) | | Vision Transformer in PyTorch | [link](https://youtu.be/ovB0ddFtzzA) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/github_adventures/vision_transformer) | | Visualizing activations with forward hooks (PyTorch) | [link](https://youtu.be/1ZbLA7ofasY) | [link](https://github.com/jankrepl/mildlyoverfitted/tree/master/mini_tutorials/visualizing_activations_with_forward_hooks) | ================================================ FILE: github_adventures/automata/model.py ================================================ import torch import torch.nn as nn class CAModel(nn.Module): """Cell automata model. Parameters ---------- n_channels : int Number of channels of the grid. hidden_channels : int Hidden channels that are related to the pixelwise 1x1 convolution. fire_rate : float Number between 0 and 1. The lower it is the more likely it is for cells to be set to zero during the `stochastic_update` process. device : torch.device Determines on what device we perfrom all the computations. Attributes ---------- update_module : nn.Sequential The only part of the network containing trainable parameters. Composed of 1x1 convolution, ReLu and 1x1 convolution. filters : torch.Tensor Constant tensor of shape `(3 * n_channels, 1, 3, 3)`. """ def __init__(self, n_channels=16, hidden_channels=128, fire_rate=0.5, device=None): super().__init__() self.fire_rate = 0.5 self.n_channels = n_channels self.device = device or torch.device("cpu") # Perceive step sobel_filter_ = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]]) scalar = 8.0 sobel_filter_x = sobel_filter_ / scalar sobel_filter_y = sobel_filter_.t() / scalar identity_filter = torch.tensor( [ [0, 0, 0], [0, 1, 0], [0, 0, 0], ], dtype=torch.float32, ) filters = torch.stack( [identity_filter, sobel_filter_x, sobel_filter_y] ) # (3, 3, 3) filters = filters.repeat((n_channels, 1, 1)) # (3 * n_channels, 3, 3) self.filters = filters[:, None, ...].to( self.device ) # (3 * n_channels, 1, 3, 3) # Update step self.update_module = nn.Sequential( nn.Conv2d( 3 * n_channels, hidden_channels, kernel_size=1, # (1, 1) ), nn.ReLU(), nn.Conv2d( hidden_channels, n_channels, kernel_size=1, bias=False, ), ) with torch.no_grad(): self.update_module[2].weight.zero_() self.to(self.device) def perceive(self, x): """Approximate channelwise gradient and combine with the input. This is the only place where we include information on the neighboring cells. However, we are not using any learnable parameters here. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_channels, grid_size, grid_size)`. Returns ------- torch.Tensor Shape `(n_samples, 3 * n_channels, grid_size, grid_size)`. """ return nn.functional.conv2d(x, self.filters, padding=1, groups=self.n_channels) def update(self, x): """Perform update. Note that this is the only part of the forward pass that uses trainable parameters Paramters --------- x : torch.Tensor Shape `(n_samples, 3 * n_channels, grid_size, grid_size)`. Returns ------- torch.Tensor Shape `(n_samples, n_channels, grid_size, grid_size)`. """ return self.update_module(x) @staticmethod def stochastic_update(x, fire_rate): """Run pixel-wise dropout. Unlike dropout there is no scaling taking place. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_channels, grid_size, grid_size)`. fire_rate : float Number between 0 and 1. The higher the more likely a given cell updates. Returns ------- torch.Tensor Shape `(n_samples, n_channels, grid_size, grid_size)`. """ device = x.device mask = (torch.rand(x[:, :1, :, :].shape) <= fire_rate).to(device, torch.float32) return x * mask # broadcasted over all channels @staticmethod def get_living_mask(x): """Identify living cells. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_channels, grid_size, grid_size)`. Returns ------- torch.Tensor Shape `(n_samples, 1, grid_size, grid_size)` and the dtype is bool. """ return ( nn.functional.max_pool2d( x[:, 3:4, :, :], kernel_size=3, stride=1, padding=1 ) > 0.1 ) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_channels, grid_size, grid_size)`. Returns ------- torch.Tensor Shape `(n_sample, n_channels, grid_size, grid_size)`. """ pre_life_mask = self.get_living_mask(x) y = self.perceive(x) dx = self.update(y) dx = self.stochastic_update(dx, fire_rate=self.fire_rate) x = x + dx post_life_mask = self.get_living_mask(x) life_mask = (pre_life_mask & post_life_mask).to(torch.float32) return x * life_mask ================================================ FILE: github_adventures/automata/train.py ================================================ import argparse import pathlib import numpy as np import torch import torch.nn as nn from PIL import Image from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from model import CAModel def load_image(path, size=40): """Load an image. Parameters ---------- path : pathlib.Path Path to where the image is located. Note that the image needs to be RGBA. size : int The image will be resized to a square wit ha side length of `size`. Returns ------- torch.Tensor 4D float image of shape `(1, 4, size, size)`. The RGB channels are premultiplied by the alpha channel. """ img = Image.open(path) img = img.resize((size, size), Image.ANTIALIAS) img = np.float32(img) / 255.0 img[..., :3] *= img[..., 3:] return torch.from_numpy(img).permute(2, 0, 1)[None, ...] def to_rgb(img_rgba): """Convert RGBA image to RGB image. Parameters ---------- img_rgba : torch.Tensor 4D tensor of shape `(1, 4, size, size)` where the RGB channels were already multiplied by the alpha. Returns ------- img_rgb : torch.Tensor 4D tensor of shape `(1, 3, size, size)`. """ rgb, a = img_rgba[:, :3, ...], torch.clamp(img_rgba[:, 3:, ...], 0, 1) return torch.clamp(1.0 - a + rgb, 0, 1) def make_seed(size, n_channels): """Create a starting tensor for training. The only active pixels are going to be in the middle. Parameters ---------- size : int The height and the width of the tensor. n_channels : int Overall number of channels. Note that it needs to be higher than 4 since the first 4 channels represent RGBA. Returns ------- torch.Tensor 4D float tensor of shape `(1, n_chanels, size, size)`. """ x = torch.zeros((1, n_channels, size, size), dtype=torch.float32) x[:, 3:, size // 2, size // 2] = 1 return x def main(argv=None): parser = argparse.ArgumentParser( description="Training script for the Celluar Automata" ) parser.add_argument("img", type=str, help="Path to the image we want to reproduce") parser.add_argument( "-b", "--batch-size", type=int, default=8, help="Batch size. Samples will always be taken randomly from the pool." ) parser.add_argument( "-d", "--device", type=str, default="cpu", help="Device to use", choices=("cpu", "cuda"), ) parser.add_argument( "-e", "--eval-frequency", type=int, default=500, help="Evaluation frequency.", ) parser.add_argument( "-i", "--eval-iterations", type=int, default=300, help="Number of iterations when evaluating.", ) parser.add_argument( "-n", "--n-batches", type=int, default=5000, help="Number of batches to train for.", ) parser.add_argument( "-c", "--n-channels", type=int, default=16, help="Number of channels of the input tensor", ) parser.add_argument( "-l", "--logdir", type=str, default="logs", help="Folder where all the logs and outputs are saved.", ) parser.add_argument( "-p", "--padding", type=int, default=16, help="Padding. The shape after padding is (h + 2 * p, w + 2 * p).", ) parser.add_argument( "--pool-size", type=int, default=1024, help="Size of the training pool", ) parser.add_argument( "-s", "--size", type=int, default=40, help="Image size", ) # Parse arguments args = parser.parse_args() print(vars(args)) # Misc device = torch.device(args.device) log_path = pathlib.Path(args.logdir) log_path.mkdir(parents=True, exist_ok=True) writer = SummaryWriter(log_path) # Target image target_img_ = load_image(args.img, size=args.size) p = args.padding target_img_ = nn.functional.pad(target_img_, (p, p, p, p), "constant", 0) target_img = target_img_.to(device) target_img = target_img.repeat(args.batch_size, 1, 1, 1) writer.add_image("ground truth", to_rgb(target_img_)[0]) # Model and optimizer model = CAModel(n_channels=args.n_channels, device=device) optimizer = torch.optim.Adam(model.parameters(), lr=2e-3) # Pool initialization seed = make_seed(args.size, args.n_channels).to(device) seed = nn.functional.pad(seed, (p, p, p, p), "constant", 0) pool = seed.clone().repeat(args.pool_size, 1, 1, 1) for it in tqdm(range(args.n_batches)): batch_ixs = np.random.choice( args.pool_size, args.batch_size, replace=False ).tolist() x = pool[batch_ixs] for i in range(np.random.randint(64, 96)): x = model(x) loss_batch = ((target_img - x[:, :4, ...]) ** 2).mean(dim=[1, 2, 3]) loss = loss_batch.mean() optimizer.zero_grad() loss.backward() optimizer.step() writer.add_scalar("train/loss", loss, it) argmax_batch = loss_batch.argmax().item() argmax_pool = batch_ixs[argmax_batch] remaining_batch = [i for i in range(args.batch_size) if i != argmax_batch] remaining_pool = [i for i in batch_ixs if i != argmax_pool] pool[argmax_pool] = seed.clone() pool[remaining_pool] = x[remaining_batch].detach() if it % args.eval_frequency == 0: x_eval = seed.clone() # (1, n_channels, size, size) eval_video = torch.empty(1, args.eval_iterations, 3, *x_eval.shape[2:]) for it_eval in range(args.eval_iterations): x_eval = model(x_eval) x_eval_out = to_rgb(x_eval[:, :4].detach().cpu()) eval_video[0, it_eval] = x_eval_out writer.add_video("eval", eval_video, it, fps=60) if __name__ == "__main__": main() ================================================ FILE: github_adventures/diffaugment/README.MD ================================================ # Data https://hanlab.mit.edu/projects/data-efficient-gans/datasets/100-shot-grumpy_cat.zip Just unzip it into `data/` and the code should work out of the box. ================================================ FILE: github_adventures/diffaugment/script.py ================================================ import argparse import pathlib import pprint from datetime import datetime import kornia.augmentation as K import torch import torchvision.transforms as transforms from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter from torchvision.utils import make_grid from tqdm import tqdm from utils import DatasetImages, Discriminator, Generator, init_weights_ def main(argv=None): # CLI parser = argparse.ArgumentParser() parser.add_argument("name", help="Name of the experiment") parser.add_argument( "-a", "--augment", action="store_true", help="If True, we apply augmentations", ) parser.add_argument( "-b", "--batch-size", type=int, default=16, help="Batch size" ) parser.add_argument( "--b1", type=float, default=0.5, help="Adam optimizer hyperparamter", ) parser.add_argument( "--b2", type=float, default=0.999, help="Adam optimizer hyperparamter", ) parser.add_argument( "-d", "--device", type=str, default="cpu", choices=["cpu", "cuda"], help="Device to use", ) parser.add_argument( "--eval-frequency", type=int, default=400, help="Generate generator images every `eval_frequency` epochs", ) parser.add_argument( "--latent-dim", type=int, default=100, help="Dimensionality of the random noise", ) parser.add_argument( "--lr", type=float, default=0.0002, help="Learning rate" ) parser.add_argument( "--ndf", type=int, default=32, help="Number of discriminator feature maps (after first convolution)", ) parser.add_argument( "--ngf", type=int, default=32, help="Number of generator feature maps (before last transposed convolution)", ) parser.add_argument( "-n", "--n-epochs", type=int, default=200, help="Number of training epochs", ) parser.add_argument( "--mosaic-size", type=int, default=10, help="Size of the side of the rectangular mosaic", ) parser.add_argument( "-p", "--prob", type=float, default=0.9, help="Probability of applying an augmentation", ) args = parser.parse_args(argv) args_d = vars(args) print(args) img_size = 128 # Additional parameters device = torch.device(args.device) mosaic_kwargs = {"nrow": args.mosaic_size, "normalize": True} n_mosaic_cells = args.mosaic_size * args.mosaic_size sample_showcase_ix = ( 0 # this one will be used to demonstrate the augmentations ) augment_module = torch.nn.Sequential( K.RandomAffine(degrees=0, translate=(1 / 8, 1 / 8), p=args.prob), K.RandomErasing((0.0, 0.5), p=args.prob), ) # Loss function adversarial_loss = torch.nn.BCELoss() # Initialize generator and discriminator generator = Generator(latent_dim=args.latent_dim, ngf=args.ngf) discriminator = Discriminator( ndf=args.ndf, augment_module=augment_module if args.augment else None ) generator.to(device) discriminator.to(device) # Initialize weights generator.apply(init_weights_) discriminator.apply(init_weights_) # Configure data loader data_path = pathlib.Path("data") tform = transforms.Compose( [ transforms.Resize(img_size), transforms.ToTensor(), transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), ] ) dataset = DatasetImages( data_path, transform=tform, ) dataloader = DataLoader( dataset, batch_size=args.batch_size, shuffle=True, ) # Optimizers optimizer_G = torch.optim.Adam( generator.parameters(), lr=args.lr, betas=(args.b1, args.b2) ) optimizer_D = torch.optim.Adam( discriminator.parameters(), lr=args.lr, betas=(args.b1, args.b2) ) # Output path and metadata output_path = pathlib.Path("outputs") / args.name output_path.mkdir(exist_ok=True, parents=True) # Add other parameters (not included in CLI) args_d["time"] = datetime.now() args_d["kornia"] = str(augment_module) # Prepare tensorboard writer writer = SummaryWriter(output_path) # Log hyperparameters as text writer.add_text( "hyperparameter", pprint.pformat(args_d).replace( "\n", " \n" ), # markdown needs 2 spaces before newline 0, ) # Log true data writer.add_image( "true_data", make_grid( torch.stack([dataset[i] for i in range(n_mosaic_cells)]), **mosaic_kwargs ), 0, ) # Log augmented data batch_showcase = dataset[sample_showcase_ix][None, ...].repeat( n_mosaic_cells, 1, 1, 1 ) batch_showcase_aug = discriminator.augment_module(batch_showcase) writer.add_image( "augmentations", make_grid(batch_showcase_aug, **mosaic_kwargs), 0 ) # Prepate evaluation noise z_eval = torch.randn(n_mosaic_cells, args.latent_dim).to(device) for epoch in tqdm(range(args.n_epochs)): for i, imgs in enumerate(dataloader): n_samples, *_ = imgs.shape batches_done = epoch * len(dataloader) + i # Adversarial ground truths valid = 0.9 * torch.ones( n_samples, 1, device=device, dtype=torch.float32 ) fake = torch.zeros(n_samples, 1, device=device, dtype=torch.float32) # D preparation optimizer_D.zero_grad() # D loss on reals real_imgs = imgs.to(device) d_x = discriminator(real_imgs) real_loss = adversarial_loss(d_x, valid) real_loss.backward() # D loss on fakes z = torch.randn(n_samples, args.latent_dim).to(device) gen_imgs = generator(z) d_g_z1 = discriminator(gen_imgs.detach()) fake_loss = adversarial_loss(d_g_z1, fake) fake_loss.backward() optimizer_D.step() # we called backward twice, the result is a sum # G preparation optimizer_G.zero_grad() # G loss d_g_z2 = discriminator(gen_imgs) g_loss = adversarial_loss(d_g_z2, valid) g_loss.backward() optimizer_G.step() # Logging if batches_done % 50 == 0: writer.add_scalar("d_x", d_x.mean().item(), batches_done) writer.add_scalar("d_g_z1", d_g_z1.mean().item(), batches_done) writer.add_scalar("d_g_z2", d_g_z2.mean().item(), batches_done) writer.add_scalar( "D_loss", (real_loss + fake_loss).item(), batches_done ) writer.add_scalar("G_loss", g_loss.item(), batches_done) if epoch % args.eval_frequency == 0 and i == 0: generator.eval() discriminator.eval() # Generate fake images gen_imgs_eval = generator(z_eval) # Generate nice mosaic writer.add_image( "fake", make_grid(gen_imgs_eval.data, **mosaic_kwargs), batches_done, ) # Save checkpoint (and potentially overwrite an existing one) torch.save(generator, output_path / "model.pt") # Make sure generator and discriminator in the training mode generator.train() discriminator.train() if __name__ == "__main__": main() ================================================ FILE: github_adventures/diffaugment/utils.py ================================================ import torch.nn as nn from PIL import Image from torch.utils.data import Dataset class DatasetImages(Dataset): """Dataset loading photos on the hard drive. Parameters ---------- path : pathlib.Path Path to the folder containing all the images. transform : None or callable The transform to be applied when yielding the image. Attributes ---------- all_paths : list List of all paths to the `.jpg` images. """ def __init__(self, path, transform=None): super().__init__() self.all_paths = sorted([p for p in path.iterdir() if p.suffix == ".jpg"]) self.transform = transform def __len__(self): """Compute length of the dataset.""" return len(self.all_paths) def __getitem__(self, ix): """Get a single item.""" img = Image.open(self.all_paths[ix]) if self.transform is not None: img = self.transform(img) return img class Generator(nn.Module): """Generator network. Parameters ---------- latent_dim : int The dimensionality of the input noise. ngf : int Number of generator filters. Note that the actual number of filters will be a multiple of this number and is going to be divided by two in each consecutive block of the network. Attributes ---------- main : torch.Sequential The actual network that is composed of `ConvTranspose2d`, `BatchNorm2d` and `ReLU` blocks. """ def __init__(self, latent_dim, ngf=64): super().__init__() self.main = nn.Sequential( nn.ConvTranspose2d(latent_dim, ngf * 16, 4, 1, 0, bias=False), nn.BatchNorm2d(ngf * 16), nn.ReLU(True), # (ngf * 16) x 4 x 4 nn.ConvTranspose2d(ngf * 16, ngf * 8, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf * 8), nn.ReLU(True), # (ngf * 8) x 8 x 8 nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf * 4), nn.ReLU(True), # (ngf * 4) x 16 x 16 nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf * 2), nn.ReLU(True), # (ngf * 2) x 32 x 32 nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False), nn.BatchNorm2d(ngf), nn.ReLU(True), # ngf x 64 x 64 nn.ConvTranspose2d(ngf, 3, 4, 2, 1, bias=False), nn.Tanh(), # 3 x 128 x 128 ) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Input noise of shape `(n_samples, latent_dim)`. Returns ------- torch.Tensor Generated images of shape `(n_samples, 3, 128, 128)`. """ x = x.reshape(*x.shape, 1, 1) # (n_samples, latent_dim, 1, 1) return self.main(x) class Discriminator(nn.Module): """Discriminator netowrk. Parameters ---------- ndf : int Number of discriminator filters. It represents the number of filters after the first convolution block. Each consecutive block will double the number. augment_module : nn.Module or None If provided it represents the Kornia module that performs differentiable augmentation of the images. Attributes ---------- augment_module : nn.Module If the input parameter `augment_module` provided then this is the same thing. If not, then this is just an identity mapping. """ def __init__(self, ndf=16, augment_module=None): super().__init__() self.main = nn.Sequential( # 3 x 128 x 128 nn.Conv2d(3, ndf, 4, stride=2, padding=1, bias=False), nn.LeakyReLU(0.2, inplace=True), # ndf x 64 x 64 nn.Conv2d(ndf, ndf * 2, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(ndf * 2), nn.LeakyReLU(0.2, inplace=True), # (ndf * 2) x 32 x 32 nn.Conv2d(ndf * 2, ndf * 4, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(ndf * 4), nn.LeakyReLU(0.2, inplace=True), # (ndf * 4) x 16 x 16 nn.Conv2d(ndf * 4, ndf * 8, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(ndf * 8), nn.LeakyReLU(0.2, inplace=True), # (ndf * 8) x 8 x 8 nn.Conv2d(ndf * 8, ndf * 16, 4, stride=2, padding=1, bias=False), nn.BatchNorm2d(ndf * 16), nn.LeakyReLU(0.2, inplace=True), # (ndf * 16) x 4 x 4 nn.Conv2d(ndf * 16, 1, 4, stride=1, padding=0, bias=False), nn.Sigmoid() # 1 x 1 x 1 ) if augment_module is not None: self.augment_module = augment_module else: self.augment_module = nn.Identity() def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Input images of shape `(n_samples, 3, 128, 128)`. Returns ------- torch.Tensor Classification outputs of shape `(n_samples, 1)`. """ if self.training: x = self.augment_module(x) x = self.main(x) # (n_samples, 1, 1, 1) x = x.reshape(len(x), -1) # (n_samples, 1) return x def init_weights_(module): """Initialize weights by sampling from a normal distribution. Note that this operation is modifying the weights in place. Parameters ---------- module : nn.Module Module with trainable weights. """ cls_name = module.__class__.__name__ if cls_name in {"Conv2d", "ConvTranspose2d"}: nn.init.normal_(module.weight.data, 0.0, 0.02) elif cls_name == "BatchNorm2d": nn.init.normal_(module.weight.data, 1.0, 0.02) nn.init.constant_(module.bias.data, 0.0) ================================================ FILE: github_adventures/dino/data/README.md ================================================ The `Imagenette` dataset was used. You can find it here: https://github.com/fastai/imagenette (320 px version). ================================================ FILE: github_adventures/dino/data/imagenette_labels.json ================================================ {"n01440764": "tench", "n02102040": "english_springer", "n02979186": "cassette_player", "n03000684": "chain_saw", "n03028079": "church", "n03394916": "french_horn", "n03417042": "garbage_truck", "n03425413": "gas_pump", "n03445777": "golf_ball", "n03888257": "parachute"} ================================================ FILE: github_adventures/dino/evaluation.py ================================================ import numpy as np import torch from sklearn.metrics import accuracy_score from sklearn.neighbors import KNeighborsClassifier def compute_knn(backbone, data_loader_train, data_loader_val): """Get CLS embeddings and use KNN classifier on them. We load all embeddings in memory and use sklearn. Should be doable. Parameters ---------- backbone : timm.models.vision_transformer.VisionTransformer Vision transformer whose head is just an identity mapping. data_loader_train, data_loader_val : torch.utils.data.DataLoader Training and validation dataloader that does not apply any augmentations. Just casting to tensor and then normalizing. Returns ------- val_accuracy : float Validation accuracy. """ device = next(backbone.parameters()).device data_loaders = { "train": data_loader_train, "val": data_loader_val, } lists = { "X_train": [], "y_train": [], "X_val": [], "y_val": [], } for name, data_loader in data_loaders.items(): for imgs, y in data_loader: imgs = imgs.to(device) lists[f"X_{name}"].append(backbone(imgs).detach().cpu().numpy()) lists[f"y_{name}"].append(y.detach().cpu().numpy()) arrays = {k: np.concatenate(l) for k, l in lists.items()} estimator = KNeighborsClassifier() estimator.fit(arrays["X_train"], arrays["y_train"]) y_val_pred = estimator.predict(arrays["X_val"]) acc = accuracy_score(arrays["y_val"], y_val_pred) return acc def compute_embedding(backbone, data_loader): """Compute CLS embedding and prepare for TensorBoard. Parameters ---------- backbone : timm.models.vision_transformer.VisionTransformer Vision transformer. The head should be an identity mapping. data_loader : torch.utils.data.DataLoader Validation dataloader that does not apply any augmentations. Just casting to tensor and then normalizing. Returns ------- embs : torch.Tensor Embeddings of shape `(n_samples, out_dim)`. imgs : torch.Tensor Images of shape `(n_samples, 3, height, width)`. labels : list List of strings representing the classes. """ device = next(backbone.parameters()).device embs_l = [] imgs_l = [] labels = [] for img, y in data_loader: img = img.to(device) embs_l.append(backbone(img).detach().cpu()) imgs_l.append(((img * 0.224) + 0.45).cpu()) # undo norm labels.extend([data_loader.dataset.classes[i] for i in y.tolist()]) embs = torch.cat(embs_l, dim=0) imgs = torch.cat(imgs_l, dim=0) return embs, imgs, labels ================================================ FILE: github_adventures/dino/train.py ================================================ import argparse import json import pathlib import timm import torch import torchvision.transforms as transforms import tqdm from torch.utils.data import DataLoader, SubsetRandomSampler from torch.utils.tensorboard import SummaryWriter from torchvision.datasets import ImageFolder from evaluation import compute_embedding, compute_knn from utils import DataAugmentation, Head, Loss, MultiCropWrapper, clip_gradients def main(): parser = argparse.ArgumentParser( "DINO training CLI", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("-b", "--batch-size", type=int, default=32) parser.add_argument( "-d", "--device", type=str, choices=("cpu", "cuda"), default="cpu" ) parser.add_argument("-l", "--logging-freq", type=int, default=200) parser.add_argument("--momentum-teacher", type=int, default=0.9995) parser.add_argument("-c", "--n-crops", type=int, default=4) parser.add_argument("-e", "--n-epochs", type=int, default=100) parser.add_argument("-o", "--out-dim", type=int, default=1024) parser.add_argument("-t", "--tensorboard-dir", type=str, default="logs") parser.add_argument("--clip-grad", type=float, default=2.0) parser.add_argument("--norm-last-layer", action="store_true") parser.add_argument("--batch-size-eval", type=int, default=64) parser.add_argument("--teacher-temp", type=float, default=0.04) parser.add_argument("--student-temp", type=float, default=0.1) parser.add_argument("--pretrained", action="store_true") parser.add_argument("-w", "--weight-decay", type=float, default=0.4) args = parser.parse_args() print(vars(args)) # Parameters vit_name, dim = "vit_deit_small_patch16_224", 384 path_dataset_train = pathlib.Path("data/imagenette2-320/train") path_dataset_val = pathlib.Path("data/imagenette2-320/val") path_labels = pathlib.Path("data/imagenette_labels.json") logging_path = pathlib.Path(args.tensorboard_dir) device = torch.device(args.device) n_workers = 4 # Data related with path_labels.open("r") as f: label_mapping = json.load(f) transform_aug = DataAugmentation(size=224, n_local_crops=args.n_crops - 2) transform_plain = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), transforms.Resize((224, 224)), ] ) dataset_train_aug = ImageFolder(path_dataset_train, transform=transform_aug) dataset_train_plain = ImageFolder(path_dataset_train, transform=transform_plain) dataset_val_plain = ImageFolder(path_dataset_val, transform=transform_plain) if dataset_train_plain.classes != dataset_val_plain.classes: raise ValueError("Inconsistent classes") data_loader_train_aug = DataLoader( dataset_train_aug, batch_size=args.batch_size, shuffle=True, drop_last=True, num_workers=n_workers, pin_memory=True, ) data_loader_train_plain = DataLoader( dataset_train_plain, batch_size=args.batch_size_eval, drop_last=False, num_workers=n_workers, ) data_loader_val_plain = DataLoader( dataset_val_plain, batch_size=args.batch_size_eval, drop_last=False, num_workers=n_workers, ) data_loader_val_plain_subset = DataLoader( dataset_val_plain, batch_size=args.batch_size_eval, drop_last=False, sampler=SubsetRandomSampler(list(range(0, len(dataset_val_plain), 50))), num_workers=n_workers, ) # Logging writer = SummaryWriter(logging_path) writer.add_text("arguments", json.dumps(vars(args))) # Neural network related student_vit = timm.create_model(vit_name, pretrained=args.pretrained) teacher_vit = timm.create_model(vit_name, pretrained=args.pretrained) student = MultiCropWrapper( student_vit, Head( dim, args.out_dim, norm_last_layer=args.norm_last_layer, ), ) teacher = MultiCropWrapper(teacher_vit, Head(dim, args.out_dim)) student, teacher = student.to(device), teacher.to(device) teacher.load_state_dict(student.state_dict()) for p in teacher.parameters(): p.requires_grad = False # Loss related loss_inst = Loss( args.out_dim, teacher_temp=args.teacher_temp, student_temp=args.student_temp, ).to(device) lr = 0.0005 * args.batch_size / 256 optimizer = torch.optim.AdamW( student.parameters(), lr=lr, weight_decay=args.weight_decay, ) # Training loop n_batches = len(dataset_train_aug) // args.batch_size best_acc = 0 n_steps = 0 for e in range(args.n_epochs): for i, (images, _) in tqdm.tqdm( enumerate(data_loader_train_aug), total=n_batches ): if n_steps % args.logging_freq == 0: student.eval() # Embedding embs, imgs, labels_ = compute_embedding( student.backbone, data_loader_val_plain_subset, ) writer.add_embedding( embs, metadata=[label_mapping[l] for l in labels_], label_img=imgs, global_step=n_steps, tag="embeddings", ) # KNN current_acc = compute_knn( student.backbone, data_loader_train_plain, data_loader_val_plain, ) writer.add_scalar("knn-accuracy", current_acc, n_steps) if current_acc > best_acc: torch.save(student, logging_path / "best_model.pth") best_acc = current_acc student.train() images = [img.to(device) for img in images] teacher_output = teacher(images[:2]) student_output = student(images) loss = loss_inst(student_output, teacher_output) optimizer.zero_grad() loss.backward() clip_gradients(student, args.clip_grad) optimizer.step() with torch.no_grad(): for student_ps, teacher_ps in zip( student.parameters(), teacher.parameters() ): teacher_ps.data.mul_(args.momentum_teacher) teacher_ps.data.add_( (1 - args.momentum_teacher) * student_ps.detach().data ) writer.add_scalar("train_loss", loss, n_steps) n_steps += 1 if __name__ == "__main__": main() ================================================ FILE: github_adventures/dino/utils.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as transforms from PIL import Image class DataAugmentation: """Create crops of an input image together with additional augmentation. It generates 2 global crops and `n_local_crops` local crops. Parameters ---------- global_crops_scale : tuple Range of sizes for the global crops. local_crops_scale : tuple Range of sizes for the local crops. n_local_crops : int Number of local crops to create. size : int The size of the final image. Attributes ---------- global_1, global_2 : transforms.Compose Two global transforms. local : transforms.Compose Local transform. Note that the augmentation is stochastic so one instance is enough and will lead to different crops. """ def __init__( self, global_crops_scale=(0.4, 1), local_crops_scale=(0.05, 0.4), n_local_crops=8, size=224, ): self.n_local_crops = n_local_crops RandomGaussianBlur = lambda p: transforms.RandomApply( # noqa [transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2))], p=p, ) flip_and_jitter = transforms.Compose( [ transforms.RandomHorizontalFlip(p=0.5), transforms.RandomApply( [ transforms.ColorJitter( brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1, ), ] ), transforms.RandomGrayscale(p=0.2), ] ) normalize = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ] ) self.global_1 = transforms.Compose( [ transforms.RandomResizedCrop( size, scale=global_crops_scale, interpolation=Image.BICUBIC, ), flip_and_jitter, RandomGaussianBlur(1.0), # always apply normalize, ], ) self.global_2 = transforms.Compose( [ transforms.RandomResizedCrop( size, scale=global_crops_scale, interpolation=Image.BICUBIC, ), flip_and_jitter, RandomGaussianBlur(0.1), transforms.RandomSolarize(170, p=0.2), normalize, ], ) self.local = transforms.Compose( [ transforms.RandomResizedCrop( size, scale=local_crops_scale, interpolation=Image.BICUBIC, ), flip_and_jitter, RandomGaussianBlur(0.5), normalize, ], ) def __call__(self, img): """Apply transformation. Parameters ---------- img : PIL.Image Input image. Returns ------- all_crops : list List of `torch.Tensor` representing different views of the input `img`. """ all_crops = [] all_crops.append(self.global_1(img)) all_crops.append(self.global_2(img)) all_crops.extend([self.local(img) for _ in range(self.n_local_crops)]) return all_crops class Head(nn.Module): """Network hooked up to the CLS token embedding. Just a MLP with the last layer being normalized in a particular way. Parameters ---------- in_dim : int The dimensionality of the token embedding. out_dim : int The dimensionality of the final layer (we compute the softmax over). hidden_dim : int Dimensionality of the hidden layers. bottleneck_dim : int Dimensionality of the second last layer. n_layers : int The number of layers. norm_last_layer : bool If True, then we freeze the norm of the weight of the last linear layer to 1. Attributes ---------- mlp : nn.Sequential Vanilla multi-layer perceptron. last_layer : nn.Linear Reparametrized linear layer with weight normalization. That means that that it will have `weight_g` and `weight_v` as learnable parameters instead of a single `weight`. """ def __init__( self, in_dim, out_dim, hidden_dim=512, bottleneck_dim=256, n_layers=3, norm_last_layer=False, ): super().__init__() if n_layers == 1: self.mlp = nn.Linear(in_dim, bottleneck_dim) else: layers = [nn.Linear(in_dim, hidden_dim)] layers.append(nn.GELU()) for _ in range(n_layers - 2): layers.append(nn.Linear(hidden_dim, hidden_dim)) layers.append(nn.GELU()) layers.append(nn.Linear(hidden_dim, bottleneck_dim)) self.mlp = nn.Sequential(*layers) self.apply(self._init_weights) self.last_layer = nn.utils.weight_norm( nn.Linear(bottleneck_dim, out_dim, bias=False) ) self.last_layer.weight_g.data.fill_(1) if norm_last_layer: self.last_layer.weight_g.requires_grad = False def _init_weights(self, m): """Initialize learnable parameters.""" if isinstance(m, nn.Linear): nn.init.normal_(m.weight, std=0.02) if m.bias is not None: nn.init.constant_(m.bias, 0) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Of shape `(n_samples, in_dim)`. Returns ------- torch.Tensor Of shape `(n_samples, out_dim)`. """ x = self.mlp(x) # (n_samples, bottleneck_dim) x = nn.functional.normalize(x, dim=-1, p=2) # (n_samples, bottleneck_dim) x = self.last_layer(x) # (n_samples, out_dim) return x class MultiCropWrapper(nn.Module): """Convenience class for forward pass of multiple crops. Parameters ---------- backbone : timm.models.vision_transformer.VisionTransformer Instantiated Vision Transformer. Note that we will take the `head` attribute and replace it with `nn.Identity`. new_head : Head New head that is going to be put on top of the `backbone`. """ def __init__(self, backbone, new_head): super().__init__() backbone.head = nn.Identity() # deactivate original head self.backbone = backbone self.new_head = new_head def forward(self, x): """Run the forward pass. The different crops are concatenated along the batch dimension and then a single forward pass is fun. The resulting tensor is then chunked back to per crop tensors. Parameters ---------- x : list List of `torch.Tensor` each of shape `(n_samples, 3, size, size)`. Returns ------- tuple Tuple of `torch.Tensor` each of shape `(n_samples, out_dim)` where `output_dim` is determined by `Head`. """ n_crops = len(x) concatenated = torch.cat(x, dim=0) # (n_samples * n_crops, 3, size, size) cls_embedding = self.backbone(concatenated) # (n_samples * n_crops, in_dim) logits = self.new_head(cls_embedding) # (n_samples * n_crops, out_dim) chunks = logits.chunk(n_crops) # n_crops * (n_samples, out_dim) return chunks class Loss(nn.Module): """The loss function. We subclass the `nn.Module` becuase we want to create a buffer for the logits center of the teacher. Parameters ---------- out_dim : int The dimensionality of the final layer (we computed the softmax over). teacher_temp, student_temp : float Softmax temperature of the teacher resp. student. center_momentum : float Hyperparameter for the exponential moving average that determines the center logits. The higher the more the running average matters. """ def __init__( self, out_dim, teacher_temp=0.04, student_temp=0.1, center_momentum=0.9 ): super().__init__() self.student_temp = student_temp self.teacher_temp = teacher_temp self.center_momentum = center_momentum self.register_buffer("center", torch.zeros(1, out_dim)) def forward(self, student_output, teacher_output): """Evaluate loss. Parameters ---------- student_output, teacher_output : tuple Tuple of tensors of shape `(n_samples, out_dim)` representing logits. The length is equal to number of crops. Note that student processed all crops and that the two initial crops are the global ones. Returns ------- loss : torch.Tensor Scalar representing the average loss. """ student_temp = [s / self.student_temp for s in student_output] teacher_temp = [(t - self.center) / self.teacher_temp for t in teacher_output] student_sm = [F.log_softmax(s, dim=-1) for s in student_temp] teacher_sm = [F.softmax(t, dim=-1).detach() for t in teacher_temp] total_loss = 0 n_loss_terms = 0 for t_ix, t in enumerate(teacher_sm): for s_ix, s in enumerate(student_sm): if t_ix == s_ix: continue loss = torch.sum(-t * s, dim=-1) # (n_samples,) total_loss += loss.mean() # scalar n_loss_terms += 1 total_loss /= n_loss_terms self.update_center(teacher_output) return total_loss @torch.no_grad() def update_center(self, teacher_output): """Update center used for teacher output. Compute the exponential moving average. Parameters ---------- teacher_output : tuple Tuple of tensors of shape `(n_samples, out_dim)` where each tensor represents a different crop. """ batch_center = torch.cat(teacher_output).mean( dim=0, keepdim=True ) # (1, out_dim) self.center = self.center * self.center_momentum + batch_center * ( 1 - self.center_momentum ) def clip_gradients(model, clip=2.0): """Rescale norm of computed gradients. Parameters ---------- model : nn.Module Module. clip : float Maximum norm. """ for p in model.parameters(): if p.grad is not None: param_norm = p.grad.data.norm(2) clip_coef = clip / (param_norm + 1e-6) if clip_coef < 1: p.grad.data.mul_(clip_coef) ================================================ FILE: github_adventures/dino/visualize_attentions.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "1a3bd5ec", "metadata": {}, "outputs": [], "source": [ "import ipywidgets\n", "import matplotlib.pyplot as plt\n", "import timm\n", "import torch\n", "from torchvision.datasets import ImageFolder\n", "import torchvision.transforms as transforms\n", "from torchvision.utils import make_grid\n", "import torch.nn.functional as F" ] }, { "cell_type": "markdown", "id": "a6eaa0ef", "metadata": {}, "source": [ "# Helpers" ] }, { "cell_type": "code", "execution_count": null, "id": "2c0b2e7c", "metadata": {}, "outputs": [], "source": [ "def get_last_attention(backbone, x):\n", " \"\"\"Get the attention weights of CLS from the last self-attention layer.\n", "\n", " Very hacky!\n", "\n", " Parameters\n", " ----------\n", " backbone : timm.models.vision_transformer.VisionTransformer\n", " Instantiated Vision Transformer. Note that we will in-place\n", " take the `head` attribute and replace it with `nn.Identity`.\n", "\n", " x : torch.Tensor\n", " Batch of images of shape `(n_samples, 3, size, size)`.\n", "\n", " Returns\n", " -------\n", " torch.Tensor\n", " Attention weights `(n_samples, n_heads, n_patches)`.\n", " \"\"\"\n", " attn_module = backbone.blocks[-1].attn\n", " n_heads = attn_module.num_heads\n", "\n", " # define hook\n", " inp = None\n", " def fprehook(self, inputs):\n", " nonlocal inp\n", " inp = inputs[0]\n", "\n", " # Register a hook\n", " handle = attn_module.register_forward_pre_hook(fprehook)\n", "\n", " # Run forward pass\n", " _ = backbone(x)\n", " handle.remove()\n", "\n", " B, N, C = inp.shape\n", " qkv = attn_module.qkv(inp).reshape(B, N, 3, n_heads, C // n_heads).permute(2, 0, 3, 1, 4)\n", " q, k, v = qkv[0], qkv[1], qkv[2]\n", "\n", " attn = (q @ k.transpose(-2, -1)) * attn_module.scale\n", " attn = attn.softmax(dim=-1)\n", "\n", " return attn[:, :, 0, 1:]" ] }, { "cell_type": "code", "execution_count": null, "id": "57b72b84", "metadata": {}, "outputs": [], "source": [ "def threshold(attn, k=30):\n", " n_heads = len(attn)\n", " indices = attn.argsort(dim=1, descending=True)[:, k:]\n", "\n", " for head in range(n_heads):\n", " attn[head, indices[head]] = 0\n", "\n", " attn /= attn.sum(dim=1, keepdim=True)\n", "\n", " return attn" ] }, { "cell_type": "code", "execution_count": null, "id": "59e9009d", "metadata": {}, "outputs": [], "source": [ "def visualize_attention(img, backbone, k=30):\n", " \"\"\"Create attention image.\n", "\n", " Parameteres\n", " -----------\n", " img : PIL.Image\n", " RGB image.\n", "\n", " backbone : timm.models.vision_transformer.VisionTransformer\n", " The vision transformer.\n", "\n", " Returns\n", " -------\n", " new_img : torch.Tensor\n", " Image of shape (n_heads, 1, height, width).\n", " \"\"\"\n", " # imply parameters\n", "\n", " patch_size = backbone.patch_embed.proj.kernel_size[0]\n", "\n", " transform = transforms.Compose([\n", "\n", " transforms.Resize((224, 224)),\n", " transforms.ToTensor(),\n", " transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),\n", " ]\n", " )\n", "\n", " device = next(backbone.parameters()).device\n", " x = transform(img)[None, ...].to(device)\n", " attn = get_last_attention(backbone, x)[0] # (n_heads, n_patches)\n", " attn = attn / attn.sum(dim=1, keepdim=True) # (n_heads, n_patches)\n", " attn = threshold(attn, k)\n", " attn = attn.reshape(-1, 14, 14) # (n_heads, 14, 14)\n", " attn = F.interpolate(attn.unsqueeze(0),\n", " scale_factor=patch_size,\n", " mode=\"nearest\"\n", " )[0]\n", "\n", " return attn" ] }, { "cell_type": "markdown", "id": "df0972ec", "metadata": {}, "source": [ "# Preparation" ] }, { "cell_type": "code", "execution_count": null, "id": "d6e0d987", "metadata": {}, "outputs": [], "source": [ "models = {\n", " \"supervised\": timm.create_model(\"vit_deit_small_patch16_224\", pretrained=True),\n", " \"selfsupervised\": torch.load(\"best_model.pth\", map_location=\"cpu\").backbone,\n", "}\n", "dataset = ImageFolder(\"data/imagenette2-320/val\")\n", "\n", "colors = [\"yellow\", \"red\", \"green\", \"blue\"]" ] }, { "cell_type": "code", "execution_count": null, "id": "690e3a1f", "metadata": { "scrolled": false }, "outputs": [], "source": [ "@ipywidgets.interact\n", "def _(\n", " i=ipywidgets.IntSlider(min=0, max=len(dataset) - 1, continuous_update=False),\n", " k=ipywidgets.IntSlider(min=0, max=195, value=10, continuous_update=False),\n", " model=ipywidgets.Dropdown(options=[\"supervised\", \"selfsupervised\"]),\n", "):\n", " img = dataset[i][0]\n", " attns = visualize_attention(img, models[model], k=k).detach()[:].permute(1, 2, 0).numpy()\n", "\n", " tform = transforms.Compose([\n", "\n", " transforms.Resize((224, 224)),\n", " ])\n", " # original image\n", " plt.imshow(tform(img))\n", " plt.axis(\"off\")\n", " plt.show()\n", "\n", " kwargs = {\"vmin\": 0, \"vmax\": 0.24}\n", " # Attentions\n", " n_heads = 6\n", "\n", " fig, axs = plt.subplots(2, 3, figsize=(10, 7))\n", " \n", " for i in range(n_heads):\n", " ax = axs[i // 3, i % 3]\n", " ax.imshow(attns[..., i], **kwargs)\n", " ax.axis(\"off\")\n", " \n", " plt.tight_layout()\n", " \n", " plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "d83eae10", "metadata": {}, "outputs": [], "source": [ "# 3244, 1942, 3482, 688, 1509, 3709" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: github_adventures/dino/visualize_augmentations.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "5801191a", "metadata": {}, "outputs": [], "source": [ "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "import ipywidgets\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import torch\n", "from PIL import Image\n", "from torchvision.datasets import ImageFolder\n", "\n", "from utils import DataAugmentation" ] }, { "cell_type": "code", "execution_count": null, "id": "ad4f7f91", "metadata": {}, "outputs": [], "source": [ "def to_numpy(t):\n", " array = torch.clip((t * 0.224) + 0.45, 0, 1).permute(1, 2, 0).numpy()\n", " return array\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "db09874a", "metadata": {}, "outputs": [], "source": [ "transform = DataAugmentation(n_local_crops=2)\n", "dataset = ImageFolder(\"data/imagenette2-320/train/\", transform=transform)" ] }, { "cell_type": "code", "execution_count": null, "id": "48738037", "metadata": {}, "outputs": [], "source": [ "@ipywidgets.interact\n", "def _(\n", " i=ipywidgets.IntSlider(min=0, max=len(dataset) - 1, continuous_update=False),\n", " seed=ipywidgets.IntSlider(min=0, max=50, continuous_update=False),\n", "):\n", " torch.manual_seed(seed)\n", " all_crops, _ = dataset[i]\n", " titles = [\"Global 1\", \"Global 2\", \"Local 1\", \"Local 2\"]\n", " \n", " original_img = np.array(Image.open(dataset.samples[i][0]))\n", " _, ax_orig = plt.subplots(figsize=(15, 5))\n", " ax_orig.imshow(original_img)\n", " ax_orig.set_title(\"Original\")\n", " ax_orig.axis(\"off\")\n", " \n", " \n", " fig, axs = plt.subplots(2, 2, figsize=(10, 10))\n", " \n", " for i, title in enumerate(titles):\n", " ax = axs[i // 2, i % 2]\n", " ax.imshow(to_numpy(all_crops[i]))\n", " ax.set_title(title)\n", " ax.axis(\"off\")\n", " fig.tight_layout()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: github_adventures/gpt/README.md ================================================ # GPT-2 custom implementation ## Installation ```python pip install -r requirements.txt ``` ## Launching script To copy weights of an official model + generate some text use the script `copy_and_generate.py` ```python (gpt) gpt$ python copy_and_generate.py --help usage: Copy weights of a HF model and generate text. [-h] [--sample] [-s STEPS] [-r RANDOM_STATE] [-t TEMPERATURE] [-k TOP_K] [-v] {gpt2,gpt2-medium,gpt2-large,distilgpt2} initial_text positional arguments: {gpt2,gpt2-medium,gpt2-large,distilgpt2} Pretrained model to use initial_text Initial text optional arguments: -h, --help show this help message and exit --sample If True sample randomly otherwise take the most probable token (default: False) -s STEPS, --steps STEPS Number of new tokens to generate (default: 30) -r RANDOM_STATE, --random-state RANDOM_STATE Random state (default: None) -t TEMPERATURE, --temperature TEMPERATURE Softmax logits temperature (default: 1) -k TOP_K, --top-k TOP_K If specified, then selecting k most probable tokens (default: None) -v, --verbose If True, then verbose (default: False) ``` ================================================ FILE: github_adventures/gpt/copy_and_generate.py ================================================ import argparse import logging import torch from model import GPT from transformers import AutoModelForCausalLM, AutoTokenizer from utils import copy_model, generate_token logging.basicConfig(format="[%(levelname)s] %(asctime)s %(message)s") logger = logging.getLogger(__file__) def main(argv=None): """Copy weights and generate some text.""" parser = argparse.ArgumentParser( "Copy weights of a HF model and generate text.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "model_name", type=str, choices=("gpt2", "gpt2-medium", "gpt2-large", "distilgpt2"), help="Pretrained model to use", ) parser.add_argument( "initial_text", type=str, help="Initial text", ) parser.add_argument( "--sample", action="store_true", help="If True sample randomly otherwise take the most probable token", ) parser.add_argument( "-s", "--steps", default=30, type=int, help="Number of new tokens to generate", ) parser.add_argument("-r", "--random-state", type=int, help="Random state") parser.add_argument( "-t", "--temperature", default=1, type=float, help="Softmax logits temperature", ) parser.add_argument( "-k", "--top-k", type=int, help="If specified, then selecting k most probable tokens", ) parser.add_argument( "-v", "--verbose", action="store_true", help="If True, then verbose" ) args = parser.parse_args(argv) # Setup logging if args.verbose: logger.setLevel(logging.INFO) else: logger.setLevel(logging.WARNING) logger.info(f"CLI parameters: {vars(args)})") tokenizer = AutoTokenizer.from_pretrained(args.model_name) model_official = AutoModelForCausalLM.from_pretrained(args.model_name) config_official = model_official.config our_params = [ "vocab_size", "n_layer", "n_embd", "n_head", "n_positions", "attn_pdrop", "embd_pdrop", "resid_pdrop", "layer_norm_epsilon", ] config_ours = {k: getattr(config_official, k) for k in our_params} logger.info(f"Model hyperparameters: {config_ours}") model_ours = GPT(**config_ours) model_ours.eval() copy_model(model_official, model_ours) token_ixs = tokenizer(args.initial_text)["input_ids"] if args.random_state: torch.manual_seed(args.random_state) # Sample for step in range(args.steps): new_token_ix = generate_token( model_ours, token_ixs, sample=args.sample, top_k=args.top_k, temperature=args.temperature, ) token_ixs.append(new_token_ix) logger.info(f"Step {step} done") text = tokenizer.decode(token_ixs) print(text) if __name__ == "__main__": main() ================================================ FILE: github_adventures/gpt/distribution_visualizations.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "896ffe86", "metadata": {}, "outputs": [], "source": [ "import ipywidgets\n", "\n", "import matplotlib.pyplot as plt\n", "import torch" ] }, { "cell_type": "markdown", "id": "09b6e1f4", "metadata": {}, "source": [ "#
Applying temperature + keeping only top K values
" ] }, { "cell_type": "markdown", "id": "2c7442cf", "metadata": {}, "source": [ "$T=\\mbox{temperature}$ $$\\large P_i=\\frac{e^{\\frac{y_i}T}}{\\sum_{k=1}^n e^{\\frac{y_k}T}}$$" ] }, { "cell_type": "code", "execution_count": null, "id": "95833de6", "metadata": {}, "outputs": [], "source": [ "@ipywidgets.interact\n", "def _(\n", " n_tokens=ipywidgets.IntSlider(min=4, max=30, value=8, continuous_update=False),\n", " random_state=ipywidgets.IntSlider(min=0, max=10, value=2, continuous_update=False),\n", " temperature=ipywidgets.FloatSlider(min=0, max=10, value=1, continuous_update=False),\n", " top_k=ipywidgets.IntSlider(min=1, max=20, value=8, continuous_update=False),\n", " ):\n", " # Preparations\n", " top_k = min(top_k, n_tokens)\n", " torch.manual_seed(random_state)\n", " logits = 10 * torch.rand(n_tokens,)\n", "\n", "\n", " # Generate original\n", " probs_orig = torch.nn.functional.softmax(logits, dim=0).numpy()\n", " \n", " # Generate new\n", " logits = logits / temperature\n", " top_values, _ = torch.topk(logits, top_k) # (top_k,) \n", " logits[logits < top_values.min()] = -torch.inf \n", " probs_new = torch.nn.functional.softmax(logits, dim=0).numpy()\n", "\n", " # Plotting\n", " fig, (ax_orig, ax_new) = plt.subplots(1, 2, sharey=True, figsize=(10, 2), dpi=100)\n", " x = range(n_tokens)\n", "\n", " ax_orig.bar(x, probs_orig)\n", " ax_orig.set_ylim((0, 1))\n", " ax_orig.set_title(\"Original\")\n", " \n", " ax_new.bar(x, probs_new)\n", " ax_new.set_title(\"Temperature + top K\")\n", " \n", " plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: github_adventures/gpt/ipython_code.py ================================================ >>> import torch >>> from model import GPT >>> from transformers import AutoModelForCausalLM >>> hparams_names = [ ... "vocab_size", ... "n_layer", ... "n_embd", ... "n_head", ... "n_positions", ... "attn_pdrop", ... "embd_pdrop", ... "resid_pdrop", ... "layer_norm_epsilon", ... ] ... >>> model_name = "gpt2" >>> model_official = AutoModelForCausalLM.from_pretrained(model_name, tie_word_embeddings=False) >>> config_official = model_official.config >>> config_official >>> config_ours = {name: getattr(config_official, name) for name in hparams_names} >>> config_ours >>> model_ours = GPT(**config_ours) >>> sum(p.numel() for p in model_ours.parameters()) >>> sum(p.numel() for p in model_official.parameters()) >>> _ = model_official.eval() >>> _ = model_ours.eval() >>> idx = torch.tensor([[1, 123, 52, 28]], dtype=torch.long) >>> logits_official = model_official(idx).logits >>> logits_ours = model_ours(idx) >>> logits_official.shape >>> logits_ours.shape >>> torch.allclose(logits_ours, logits_official, rtol=0, atol=1e-3) >>> (logits_ours - logits_official).abs().max() >>> from utils import copy_model >>> copy_model(model_official, model_ours) >>> logits_official = model_official(idx).logits >>> logits_ours = model_ours(idx) >>> torch.allclose(logits_ours, logits_official, rtol=0, atol=1e-3) >>> (logits_ours - logits_official).abs().max() ================================================ FILE: github_adventures/gpt/model.py ================================================ import torch import torch.nn as nn from transformers.activations import gelu_new class CustomGELU(nn.Module): """GELU implementation taken from the `transformers`.""" def forward(self, x): """Run forward pass.""" return gelu_new(x) class Block(nn.Module): """Decoder block. Parameters ---------- n_embd : int Dimensionality of the embeddings. n_head : int Number of attention heads. n_positions : int Maximum number of tokens. attn_pdrop : float Probability of dropout on attention weights. resid_pdrop : float Probability of dropout after applying the MLP. layer_norm_epsilon : float Hyperparameter of layer normalization. Attributes ---------- ln_1, ln_2 : nn.LayerNorm Layer norms. attention : nn.MultiHeadAttention Attention module. mlp : nn.Sequential Multilayer perceptron. """ def __init__( self, *, n_embd, n_head, n_positions, attn_pdrop, resid_pdrop, layer_norm_epsilon, ): super().__init__() self.ln_1 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon) self.ln_2 = nn.LayerNorm(n_embd, eps=layer_norm_epsilon) self.attention = nn.MultiheadAttention( embed_dim=n_embd, num_heads=n_head, dropout=attn_pdrop, bias=True, batch_first=True, ) self.register_buffer( "mask", (1 - torch.tril(torch.ones(n_positions, n_positions))).to( dtype=torch.bool ), ) self.mlp = nn.Sequential( nn.Linear(n_embd, 4 * n_embd), CustomGELU(), nn.Linear(4 * n_embd, n_embd), nn.Dropout(resid_pdrop), ) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Input tensor of shape `(batch_size, n_tokens, n_embd)`. Returns ------- torch.Tensor Output tensor of shape `(batch_size, n_tokens, n_embd)`. """ batch_size, n_tokens, n_embd = x.shape x_ = self.ln_1(x) # (batch_size, n_tokens, n_embd) mask = self.mask[:n_tokens, :n_tokens] # (n_tokens, n_tokens) attn_out, _ = self.attention( x_, x_, x_, attn_mask=mask, need_weights=False ) # (batch_size, n_tokens, n_embd) x = x + attn_out # (batch_size, n_tokens, n_embd) x = x + self.mlp(self.ln_2(x)) # (batch_size, n_tokens, n_embd) return x class GPT(nn.Module): """Entire GPT model. Parameters ---------- vocab_size : int Number of tokens in the vocabulary. n_layer : int Number of decoder blocks to include. n_embd : int Dimensionality of the embeddings. n_head : int Number of attention heads. n_positions : int Maximum number of tokens. attn_pdrop : float Probability of dropout on attention weights. embd_pdrop : float Probability of dropout on the sum of embeddings. resid_pdrop : float Probability of dropout after applying the MLP. layer_norm_epsilon : float Hyperparameter of layer normalization. Attributes ---------- token_emb : nn.Embedding Token embeddings. pos_emb : nn.Embedding Positional embedding. drop : nn.Dropout Dropout module to be applied on the sum of embeddings. blocks : nn.Sequential List of decoder blocks. ln : nn.LayerNorm Layer norm applied before applying `head`. head : nn.Linear Final linear layer. """ def __init__( self, *, vocab_size, n_layer, n_embd, n_head, n_positions, attn_pdrop, embd_pdrop, resid_pdrop, layer_norm_epsilon, ): super().__init__() self.n_positions = n_positions self.token_emb = nn.Embedding(vocab_size, n_embd) self.pos_emb = nn.Embedding(n_positions, n_embd) self.drop = nn.Dropout(embd_pdrop) self.blocks = nn.Sequential( *[ Block( n_embd=n_embd, n_head=n_head, n_positions=n_positions, attn_pdrop=attn_pdrop, resid_pdrop=resid_pdrop, layer_norm_epsilon=layer_norm_epsilon, ) for _ in range(n_layer) ] ) self.ln = nn.LayerNorm(n_embd, eps=layer_norm_epsilon) self.head = nn.Linear(n_embd, vocab_size, bias=False) def forward(self, idx): """Run forward pass. Parameters ---------- idx : torch.Tensor Integer tensor of shape `(batch_size, n_tokens)` where each element is in the range `[0, vocab_size)`. Returns ------- logits : torch.Tensor Tensor of shape `(batch_size, n_tokens, vocab_size)`. """ batch_size, n_tokens = idx.shape device = idx.device if n_tokens > self.n_positions: raise ValueError("There are too many tokens in the input") positions = torch.arange(n_tokens, device=device) # (n_tokens,) token_emb = self.token_emb(idx) # (batch_size, n_tokens, n_embd) pos_emb = self.pos_emb(positions)[None, ...] # (1, n_tokens, n_embd) x = self.drop(token_emb + pos_emb) # (batch_size, n_tokens, n_embd) x = self.blocks(x) # (batch_size, n_tokens, n_embd) x = self.ln(x) # (batch_size, n_tokens, n_embd) logits = self.head(x) # (batch_size, n_tokens, vocab_size) return logits ================================================ FILE: github_adventures/gpt/requirements.txt ================================================ ipython==7.30.1 ipywidgets==7.6.5 jupyter==1.0.0 matplotlib==3.5.1 numpy==1.21.5 torch==1.10.1 -e git+https://github.com/huggingface/transformers.git@05fa1a7ac17bb7aa07b9e0c1e138ecb31a28bbfe#egg=transformers ================================================ FILE: github_adventures/gpt/utils.py ================================================ import torch def copy_parameter(param_official, param_ours): """Copy values of one tensor to another tensor. Parameters ---------- param_official : torch.Tensor The value of this tensor will be copied. param_ours : torch.Tensor This tensor will be overwritten in-place with the values from `param_official`. """ if param_official.shape != param_ours.shape: raise ValueError("The shapes of the provided tensors are different") with torch.no_grad(): param_ours.copy_(param_official) def copy_block(block_official, block_ours): """Copy all parameters within a transformer block. Parameters ---------- block_official : transformers.models.gpt2.modeling_gpt2.GPT2Block Block coming from the huggingface code. block_ours : Block Our block. """ b_a = block_official b_b = block_ours # LN 1 copy_parameter(b_a.ln_1.weight, b_b.ln_1.weight) copy_parameter(b_a.ln_1.bias, b_b.ln_1.bias) # Attention copy_parameter(b_a.attn.c_attn.weight.T, b_b.attention.in_proj_weight) copy_parameter(b_a.attn.c_attn.bias, b_b.attention.in_proj_bias) copy_parameter(b_a.attn.c_proj.weight.T, b_b.attention.out_proj.weight) copy_parameter(b_a.attn.c_proj.bias, b_b.attention.out_proj.bias) # LN 2 copy_parameter(b_a.ln_2.weight, b_b.ln_2.weight) copy_parameter(b_a.ln_2.bias, b_b.ln_2.bias) # MLP copy_parameter(b_a.mlp.c_fc.weight.T, b_b.mlp[0].weight) copy_parameter(b_a.mlp.c_fc.bias, b_b.mlp[0].bias) copy_parameter(b_a.mlp.c_proj.weight.T, b_b.mlp[2].weight) copy_parameter(b_a.mlp.c_proj.bias, b_b.mlp[2].bias) def copy_model(model_official, model_ours): """Copy all trainable weights. Parameters ---------- model_official : transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel Huggingface model. model_ours : GPT Our model. """ m_a = model_official m_b = model_ours # Token and positional embeddings copy_parameter(m_a.transformer.wpe.weight, m_b.pos_emb.weight) copy_parameter(m_a.transformer.wte.weight, m_b.token_emb.weight) # Blocks for block_official, block_ours in zip(m_a.transformer.h, m_b.blocks): copy_block(block_official, block_ours) # Head copy_parameter(m_a.transformer.ln_f.weight, m_b.ln.weight) copy_parameter(m_a.transformer.ln_f.bias, m_b.ln.bias) copy_parameter(m_a.lm_head.weight, m_b.head.weight) @torch.no_grad() def generate_token( model, token_ixs, temperature=1.0, sample=False, top_k=None ): """Generate a single token given previous tokens. Parameters ---------- model : GPT Our GPT model. token_ixs : list List of conditional input token ids. temperature : float The higher the more variability and vice versa. sample : bool If True, we sample from the distribution (=there is randomness). If False, we just take the argmax (=there is no randomness). top_k : int or None If not None then we modify the distribution to only contain the `top_k` most probable outcomes. Returns ------- new_token_ix : int Index of the new token. """ context_token_ixs = token_ixs[-model.n_positions :] ixs = torch.tensor(context_token_ixs).to(dtype=torch.long)[ None, : ] # (1, n_tokens) logits_all = model(ixs) # (1, n_tokens, vocab_size) logits = logits_all[0, -1, :] # (vocab_size,) logits = logits / temperature # (vocab_size,) if top_k is not None: # Find the top k biggest elements, set the remaining elements to -inf top_values, _ = torch.topk(logits, top_k) # (top_k,) logits[logits < top_values.min()] = -torch.inf probs = torch.nn.functional.softmax(logits, dim=0) # (vocab_size,) if sample: new_token_ix = torch.multinomial(probs, num_samples=1) else: new_token_ix = probs.argmax() return new_token_ix.item() ================================================ FILE: github_adventures/integer/README.md ================================================ # On-line encyclopedia of integer sequences You can use the `fetch_data.py` to download the sequences. However, I actually found out (after filming the video) that you can literally download all the sequences here: https://oeis.org/wiki/Welcome#Compressed_Versions So you should probably do that and spare their API. # The GloVe embeddings The one that I used in the video are located here: https://nlp.stanford.edu/data/glove.6B.zip ================================================ FILE: github_adventures/integer/bert.py ================================================ import argparse import numpy as np import torch from torch.utils.tensorboard import SummaryWriter from transformers import BertModel, BertTokenizer from utils import create_classification_targets, train_classifier def main(argv=None): parser = argparse.ArgumentParser("Evaluating BERT integer embeddings") parser.add_argument( "log_folder", type=str, help="Folder where to log results", ) parser.add_argument( "--max-value-eval", type=int, default=500, help="Number of integers to run the evaluation on", ) args = parser.parse_args(argv) model_name = "bert-base-uncased" # Create writer writer = SummaryWriter(args.log_folder) tokenizer = BertTokenizer.from_pretrained(model_name) model = BertModel.from_pretrained(model_name) # Retrieve embeddings to_find = list(map(str, range(args.max_value_eval))) positions = np.array(tokenizer.convert_tokens_to_ids(to_find)) unk_token_position = tokenizer.convert_tokens_to_ids(tokenizer.unk_token) is_valid = positions != unk_token_position print( "The following numbers are missing", [i for i, x in enumerate(is_valid) if not x], ) arange = np.arange(args.max_value_eval) numbers = arange[is_valid] embeddings = ( model.embeddings.word_embeddings(torch.from_numpy(positions[is_valid])) .detach() .numpy() ) ys_clf = create_classification_targets(numbers) keys = sorted(ys_clf.keys()) metadata = np.array([numbers] + [ys_clf[k] for k in keys]).T.tolist() metadata_header = ["value"] + keys for name, y in ys_clf.items(): metrics = train_classifier(embeddings, y) for metric_name, value in metrics.items(): writer.add_scalar( f"{name}/{metric_name}", value, ) writer.add_embedding( embeddings, metadata=metadata, metadata_header=metadata_header, ) if __name__ == "__main__": main() ================================================ FILE: github_adventures/integer/experiments.sh ================================================ set -x OUTPUT_PATH=results GLOVE_PATH=glove.6B.300d.txt SEQUENCES_PATH=raw_data.pkl MAX_VALUE_EVAL=500 python glove.py --max-value-eval $MAX_VALUE_EVAL $GLOVE_PATH $OUTPUT_PATH/glove python bert.py --max-value-eval $MAX_VALUE_EVAL $OUTPUT_PATH/BERT python lstm.py \ $SEQUENCES_PATH \ $OUTPUT_PATH/LSTM \ --batch-size 128 \ --device cuda \ --embedding-dim 128 \ --hidden-dim 256 \ --max-value-eval $MAX_VALUE_EVAL \ --max-value 20000 \ --n-epochs 20000 \ --sequence-len 100 ================================================ FILE: github_adventures/integer/fetch_data.py ================================================ import pathlib import pickle import requests from joblib import Parallel, delayed, parallel_backend def get_sequence(sequence_id): """Get an integer sequence from the online OEIS. Parameters ---------- sequence_id : int Unique identifier for the desired sequence. Returns ------- sequence : list List of integers Raises ------ HTTPError Was not possible to get the given sequence """ url = f"https://oeis.org/search?fmt=json&q=id:A{sequence_id:07}" print(sequence_id) response = requests.get(url) response.raise_for_status() data_str = response.json()["results"][0]["data"] sequence = [int(x) for x in data_str.split(",")] return sequence if __name__ == "__main__": # Parameters n_sequences = 5000 start_id = 1 # seems like 1 - 340_000 are valid sequences n_jobs = 64 backend = "threading" # "threading" or "loky" # Preparation end_id = start_id + n_sequences output_folder = pathlib.Path("data/") output_folder.mkdir(exist_ok=True, parents=True) output_path = output_folder / f"{start_id}_{end_id - 1}.pkl" with parallel_backend(backend, n_jobs=n_jobs): res = Parallel()(delayed(get_sequence)(i) for i in range(start_id, end_id)) with output_path.open("wb") as f: pickle.dump(res, f) ================================================ FILE: github_adventures/integer/glove.py ================================================ import argparse import numpy as np from torch.utils.tensorboard import SummaryWriter from utils import create_classification_targets, train_classifier def main(argv=None): parser = argparse.ArgumentParser("Evaluating GloVe integer embeddings") parser.add_argument( "glove_path", type=str, help="Path to a txt file holding the GloVe embeddings", ) parser.add_argument( "log_folder", type=str, help="Folder where to log results", ) parser.add_argument( "--max-value-eval", type=int, default=500, help="Number of integers to run the evaluation on", ) parser.add_argument( "--dim", type=int, default=300, help="Dimensionality of the embeddings", ) args = parser.parse_args() # Create writer writer = SummaryWriter(args.log_folder) # Retrieve embeddings to_find = set(map(str, range(args.max_value_eval))) embeddings = np.empty((args.max_value_eval, args.dim)) with open(args.glove_path) as f: for line in f: token, *vector_ = line.split(" ") if token in to_find: embeddings[int(token)] = list(map(float, vector_)) to_find.remove(token) assert not to_find arange = np.arange(args.max_value_eval) ys_clf = create_classification_targets(arange) keys = sorted(ys_clf.keys()) metadata = np.array([arange] + [ys_clf[k] for k in keys]).T.tolist() metadata_header = ["value"] + keys for name, y in ys_clf.items(): metrics = train_classifier(embeddings, y) for metric_name, value in metrics.items(): writer.add_scalar( f"{name}/{metric_name}", value, ) writer.add_embedding( embeddings, metadata=metadata, metadata_header=metadata_header, ) if __name__ == "__main__": main() ================================================ FILE: github_adventures/integer/lstm.py ================================================ import argparse import json import pathlib import pickle import matplotlib.pyplot as plt import numpy as np import torch import torch.nn as nn import tqdm from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter from utils import ( CustomDataset, Network, create_classification_targets, train_classifier, ) def main(argv=None): parser = argparse.ArgumentParser("Embedding integers using LSTM") parser.add_argument( "data_path", type=str, help="Path to the pickled sequences" ) parser.add_argument( "log_folder", type=str, help="Folder where to log results" ) parser.add_argument( "-b", "--batch-size", type=int, default=128, help="Batch size" ) parser.add_argument( "-d", "--dense-dim", type=int, default=256, help="Dense dimension" ) parser.add_argument("--device", type=str, default="cpu", help="Device") parser.add_argument( "-e", "--embedding-dim", type=int, default=128, help="Embedding dimension", ) parser.add_argument( "--hidden-dim", type=int, default=256, help="Hidden dimension" ) parser.add_argument( "--max-value-eval", type=int, default=500, help="Evaluation limit", ) parser.add_argument( "-m", "--max-value", type=int, default=20000, help="The maximum allowed value (non inclusive)", ) parser.add_argument( "-n", "--n-epochs", type=int, default=100, help="Number of epochs" ) parser.add_argument( "-l", "--sequence-len", type=int, default=100, help="The maximum length of a sequence", ) args = parser.parse_args(argv) # Preparations device = torch.device(args.device) eval_frequency = 500 log_folder = pathlib.Path(args.log_folder) model_path = log_folder / "checkpoint.pth" writer = SummaryWriter(log_folder) writer.add_text("parameters", json.dumps(vars(args))) # Dataset related data_path = pathlib.Path(args.data_path) with data_path.open("rb") as f: raw_sequences = pickle.load(f) dataset = CustomDataset( raw_sequences, max_value=args.max_value, sequence_len=args.sequence_len, ) fig, ax = plt.subplots() ax.hist(dataset.normalized_sequences.ravel(), bins=100) ax.set_title( f"Number distribution (numbers={dataset.normalized_sequences.shape})" ) writer.add_figure("number distribution", fig) dataloader = DataLoader( dataset, shuffle=True, batch_size=args.batch_size, pin_memory=True, ) # Newtork, loss and the optimizer net = Network( max_value=args.max_value, hidden_dim=args.hidden_dim, embedding_dim=args.embedding_dim, dense_dim=args.dense_dim, ) net.to(device) loss_inst = nn.CrossEntropyLoss( ignore_index=args.max_value, ) optimizer = torch.optim.Adam(net.parameters()) # Validation preparation max_value_eval = args.max_value_eval or args.max_value arange = np.arange(max_value_eval) ys_clf = create_classification_targets(arange) keys = sorted(ys_clf.keys()) metadata = np.array([arange] + [ys_clf[k] for k in keys]).T.tolist() metadata_header = ["value"] + keys step = 0 for _ in range(args.n_epochs): for x in tqdm.tqdm(dataloader): x = x.to(device) logits_ = net(x) # (batch_size, sequence_len, max_value) logits = logits_[:, :-1].permute( 0, 2, 1 ) # (batch_size, max_value, sequence_len - 1) target = x[:, 1:] # (batch_size, sequence_len - 1) loss = loss_inst(logits, target) optimizer.zero_grad() loss.backward() optimizer.step() writer.add_scalar("loss", loss, step) if step % eval_frequency == 0: X = ( net.embedding.weight.detach() .cpu() .numpy()[:max_value_eval] ) writer.add_embedding( X, global_step=step, tag="Integer embeddings", metadata=metadata, metadata_header=metadata_header, ) for name, y in ys_clf.items(): metrics = train_classifier(X, y) for metric_name, value in metrics.items(): writer.add_scalar( f"{name}/{metric_name}", value, step, ) torch.save(net, model_path) step += 1 if __name__ == "__main__": main() ================================================ FILE: github_adventures/integer/requirements.txt ================================================ joblib matplotlib numpy requests scikit-learn sympy tensorboard torch transformers ================================================ FILE: github_adventures/integer/utils.py ================================================ import numpy as np import torch import torch.nn as nn from sklearn.linear_model import LogisticRegression from sklearn.model_selection import StratifiedKFold, cross_validate from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler from sympy.ntheory import isprime from torch.utils.data import Dataset class CustomDataset(Dataset): """Dataset containing integer sequences. Parameters ---------- raw_sequences : list of list of str Containing the original raw sequences. Note that their length differs. sequence_len : int The lenght og the sequence. If the original sequence is shorter, we just pad it with `max_value`. If the original sequence is longer we simply cut if off. max_value : int The maximum allowed value (non inclusive). We will only consider sequences that had the first `sequence_len` elements in the range `[0, max_value)`. Attributes ---------- normalized_sequences : np.ndarray 2D array of shape `(n_sequences, sequence_len)`. It only contains sequences that had the first `sequence_len` elements in the range `[0, max_value)`. """ def __init__( self, raw_sequences, sequence_len=80, max_value=2000, ): filtered_sequences = list( filter( lambda seq: all( 0 <= x < max_value for x in seq[:sequence_len] ), raw_sequences, ) ) n_sequences = len(filtered_sequences) self.normalized_sequences = max_value * np.ones( (n_sequences, sequence_len), dtype=np.int64, ) for i, seq in enumerate(filtered_sequences): actual_len = min(len(seq), sequence_len) self.normalized_sequences[i, :actual_len] = seq[:actual_len] def __len__(self): """Get the length of the dataset.""" return len(self.normalized_sequences) def __getitem__(self, ix): """Get a single sample of the dataset.""" return self.normalized_sequences[ix] class Network(nn.Module): """Network predicting next number in the sequence. Parameters ---------- max_value : int Maximum integer value allowed inside of the sequence. We will generate an embedding for each of the numbers in `[0, max_value]`. embedding_dim : int Dimensionality of the integer embeddings. n_layers : int Number of layers inside of the LSTM. hidden_dim : int Dimensionality of the hidden state (LSTM). dense_dim : int Dimensionality of the dense layer. Attributes ---------- embedding : torch.nn.Embedding Embeddings of all the integers. lstm : torch.nn.LSTM LSTM subnetwork. Inputs integer embeddings and outputs new hidden states. linear : torch.nn.Linear Inputs hidden states and tranforms them. classifier : torch.nn.Linear Inputs outputs of the `linear` and outputs the logits over all possible integers. """ def __init__( self, max_value=2000, embedding_dim=100, n_layers=2, hidden_dim=64, dense_dim=256, ): super().__init__() self.embedding = nn.Embedding( num_embeddings=max_value + 1, embedding_dim=embedding_dim, padding_idx=max_value, ) self.lstm = nn.LSTM( input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True, ) self.linear = nn.Linear( hidden_dim, dense_dim, ) self.classifier = nn.Linear( dense_dim, max_value, ) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Input tensor of shape `(batch_size, sequence_len)` and has dtype `torch.long`. Returns ------- logits : torch.Tensor Logits over all possible integers of shape `(batch_size, sequence_len, max_value)`. """ emb = self.embedding(x) # (batch_size, sequence_len, embedding_dim) h, *_ = self.lstm(emb) # (batch_size, sequence_len, hidden_dim) dense = torch.relu( self.linear(h) ) # (batch_size, sequence_len, dense_dim) logits = self.classifier( dense ) # (batch_size, sequence_len, max_value) return logits def train_classifier(X, y, random_state=2): """Cross-validate classification problem using logistic regression. Parameters ---------- X : np.ndarray 2D array holding the features of shape `(n_samples, n_features)`. y : np.ndarray 1D array holding the classification targets of shape `(n_samples,)`. random_state : int Guaranteeing reproducibility. Returns ------- metrics : dict Holds train and validation accuracy averaged over all the folds. """ cv = StratifiedKFold( n_splits=5, random_state=random_state, shuffle=True, ) clf = make_pipeline( StandardScaler(), LogisticRegression( max_iter=2000, random_state=random_state, ), ) res = cross_validate( clf, X, y, return_train_score=True, cv=cv, ) metrics = { "train_acc": res["train_score"].mean(), "test_acc": res["test_score"].mean(), } return metrics def create_classification_targets(indices): """Create multiple classification targets. They represent common properties of integers. Parameters ---------- indices : np.ndarray 1D array holding the integers for which we want to compute the targets. Returns ------- targets : dict Keys are property names and the values are arrays of the same shape as `indices` representing whether a given integer does / does not have a given property. """ targets = { "divisibility_2": (indices % 2 == 0).astype(float), "divisibility_3": (indices % 3 == 0).astype(float), "divisibility_4": (indices % 4 == 0).astype(float), "divisibility_5": (indices % 5 == 0).astype(float), "divisibility_10": (indices % 10 == 0).astype(float), "prime": np.vectorize(isprime)(indices).astype(float), } return targets ================================================ FILE: github_adventures/lottery/README.md ================================================ # The Lottery Ticket Hypothesis ## Installation ```bash pip install -r requirements.txt ``` ## Running experiments The training logic is implemented inside of the script `main.py`. To get more information about the CLI run ```bash python main.py --help ``` If you want to run an entire grid search over different hyperparameters you can use the `parallel_launch.sh` script. Note that it depends on a tool called `parallel` ([more info](https://www.gnu.org/software/parallel/)). Note that the script allows for dry runs (default behavior) and progress bars. ```bash ./parallel_launch.sh ``` ================================================ FILE: github_adventures/lottery/data.py ================================================ from torch.utils.data import Dataset from torchvision.datasets import MNIST from torchvision.transforms import Compose, Lambda, ToTensor class MNISTDataset(Dataset): """MNIST dataset. Feature images are automatically flattened. Parameters ---------- root : str Directory where the actual data is located (or downloaded to). train : bool If True the training set is returned (60_000 samples). Otherwise the validation set is returned (10_000 samples). Attributes ---------- tv_dataset : MNIST Instance of the torchvision `MNIST` dataset class. """ def __init__(self, root, train=True, download=True): transform = Compose( [ ToTensor(), Lambda(lambda x: x.ravel()), ] ) self.tv_dataset = MNIST( root, train=train, download=download, transform=transform, ) def __len__(self): """Get the length of the dataset.""" return len(self.tv_dataset) def __getitem__(self, ix): """Get a selected sample. Parameters ---------- ix : int Index of the sample to get. Returns ------- x : torch.Tensor Flattened feature tensor of shape `(784,)`. y : torch.Tensor Scalar representing the ground truth label. Number between 0 and 9. """ return self.tv_dataset[ix] ================================================ FILE: github_adventures/lottery/main.py ================================================ import argparse import torch import torch.nn as nn import tqdm from torch.utils.data import DataLoader import wandb from data import MNISTDataset from utils import MLP, compute_stats, copy_weights_mlp, prune_mlp, reinit_mlp def loop_dataloader(dataloader): """Loop infinitely over a dataloader. Parameters ---------- dataloader : DataLoader DataLoader streaming batches of samples. Yields ------ X_batch : torch.Tensor Batch of features. y_batch : torch.Tensor Batch of predictions. """ while True: for x in iter(dataloader): yield x def train( model, dataloader_train, loss_inst, optimizer, max_iter=10_000, dataloader_val=None, val_freq=500, ): """Run the training loop. Parameters ---------- model : nn.Module Neural network (in our case MLP). dataloader_train : DataLoader Dataloader yielding training samples. loss_inst : callable Computes the loss when called. optimizer : torch.optim.Optimizer Instance of an optimizer. max_iter : int The number of iterations we run the training for (= number of graident descent steps). dataloader_val : None or DataLoader Dataloader yielding validation samples. If provided it will also single to us that we want to track metrics. val_freq : int How often evaluation run. """ iterable = loop_dataloader(dataloader_train) iterable = tqdm.tqdm(iterable, total=max_iter) it = 0 for X_batch, y_batch in iterable: if it == max_iter: break logit_batch = model(X_batch) loss = loss_inst(logit_batch, y_batch) if dataloader_val is not None: wandb.log({"loss": loss}, step=it) optimizer.zero_grad() loss.backward() optimizer.step() if it % val_freq == 0 and dataloader_val is not None: is_equal = [] for X_batch_val, y_batch_val in dataloader_val: is_equal.append( model(X_batch_val).argmax(dim=-1) == y_batch_val ) is_equal_t = torch.cat(is_equal) acc = is_equal_t.sum() / len(is_equal_t) wandb.log({"accuracy_val": acc}, step=it) it += 1 def main(argv=None): """Create CLI and run experiments.""" parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) parser.add_argument( "-i", "--max-iter", help="Number of iterations", type=int, default=50000, ) parser.add_argument( "-b", "--batch-size", help="Batch size", type=int, default=60, ) parser.add_argument( "--prune-iter", help="Number of prune iterations", type=int, default=1, ) parser.add_argument( "-m", "--prune-method", help="Pruning method to employ", type=str, choices=("l1", "random"), default="l1", ) parser.add_argument( "-p", "--prune-ratio", help="Percentage of weights to remove", type=float, default=0.2, ) parser.add_argument( "--val-freq", help="How often to compute the validation accuracy", type=int, default=250, ) parser.add_argument( "-r", "--reinitialize", help="If true, reinitializes randomly all weights after pruning", type=str, choices=("true", "false"), # easy for hyperparameter search default="false", ) parser.add_argument( "-s", "--random-state", help="Random state", type=int, ) parser.add_argument( "--wandb-entity", help="W&B entity", type=str, default="mildlyoverfitted", ) parser.add_argument( "--wandb-project", help="W&B project", type=str, ) args = parser.parse_args(argv) wandb.init( project=args.wandb_project, entity=args.wandb_entity, config=vars(args), ) wandb.define_metric("accuracy_val", summary="max") dataset_train = MNISTDataset( "data", train=True, download=True, ) dataset_val = MNISTDataset( "data", train=False, download=True, ) if args.random_state is not None: torch.manual_seed(args.random_state) dataloader_train = DataLoader( dataset_train, batch_size=args.batch_size, shuffle=True ) dataloader_val = DataLoader( dataset_val, batch_size=args.batch_size, shuffle=True ) kwargs = dict( n_features=28 * 28, hidden_layer_sizes=(300, 100), n_targets=10, ) mlp = MLP(**kwargs) mlp_copy = MLP(**kwargs) mlp_copy.load_state_dict(mlp.state_dict()) loss_inst = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(mlp.parameters(), lr=1.2 * 1e-3) # Train and prune loop if args.prune_ratio > 0: per_round_prune_ratio = 1 - (1 - args.prune_ratio) ** ( 1 / args.prune_iter ) per_round_prune_ratios = [per_round_prune_ratio] * len(mlp.module_list) per_round_prune_ratios[-1] /= 2 per_round_max_iter = int(args.max_iter / args.prune_iter) for prune_it in range(args.prune_iter): train( mlp, dataloader_train, loss_inst, optimizer, max_iter=per_round_max_iter, ) prune_mlp(mlp, per_round_prune_ratios, method=args.prune_method) copy_weights_mlp(mlp_copy, mlp) stats = compute_stats(mlp) for name, stat in stats.items(): summary_name = f"{name}_pruneiter={prune_it}" wandb.run.summary[summary_name] = stat if args.reinitialize == "true": reinit_mlp(mlp) # Run actual training with a final pruned network train( mlp, dataloader_train, loss_inst, optimizer, max_iter=args.max_iter, dataloader_val=dataloader_val, val_freq=args.val_freq, ) if __name__ == "__main__": main() ================================================ FILE: github_adventures/lottery/parallel_launch.sh ================================================ # Parallel parameters N_JOBS=4 ARGS="-P$N_JOBS --header :" # arguments for parallel # ARGS="--bar "$ARGS ARGS="--dry-run "$ARGS # Experiment parameters ENTITY='mildlyoverfitted' PROJECT='lottery_parallel_2' # it should already exist to avoid issues MAX_ITERS=(15000) PRUNE_ITERS=(1 5) PRUNE_METHODS=('l1' 'random') PRUNE_RATIOS=(0 0.1 0.25 0.5 0.8 0.9 0.93 0.97) REINITIALIZES=('true' 'false') RANDOM_STATES=(1 2 3 4 5) parallel $ARGS \ python main.py \ --max-iter={max_iter} \ --prune-iter={prune_iter} \ --prune-method={prune_method} \ --prune-ratio={prune_ratio} \ --random-state={random_state} \ --reinitialize={reinitialize} \ --wandb-entity=$ENTITY \ --wandb-project=$PROJECT \ ::: max_iter "${MAX_ITERS[@]}" \ ::: prune_iter "${PRUNE_ITERS[@]}" \ ::: prune_method "${PRUNE_METHODS[@]}" \ ::: prune_ratio "${PRUNE_RATIOS[@]}" \ ::: random_state "${RANDOM_STATES[@]}" \ ::: reinitialize "${REINITIALIZES[@]}" \ ================================================ FILE: github_adventures/lottery/requirements.txt ================================================ numpy pillow six torch torch-vision tqdm wandb ================================================ FILE: github_adventures/lottery/utils.py ================================================ import math import torch import torch.nn as nn from torch.nn.utils.prune import l1_unstructured, random_unstructured class MLP(nn.Module): """Multilayer perceptron. The bias is included in all linear layers. Parameters ---------- n_features : int Number of input features (pixels inside of MNIST images). hidden_layer_sizes : tuple Tuple of ints representing sizes of the hidden layers. n_targets : int Number of target classes (10 for MNIST). Attributes ---------- module_list : nn.ModuleList List holding all the linear layers in the right order. """ def __init__(self, n_features, hidden_layer_sizes, n_targets): super().__init__() layer_sizes = (n_features,) + hidden_layer_sizes + (n_targets,) layer_list = [] for i in range(len(layer_sizes) - 1): layer_list.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1])) self.module_list = nn.ModuleList(layer_list) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Batch of features of shape `(batch_size, n_features)`. Returns ------- torch.Tensor Batch of predictions (logits) of shape `(batch_size, n_targets)`. """ n_layers = len(self.module_list) for i, layer in enumerate(self.module_list): x = layer(x) if i < n_layers - 1: x = nn.functional.relu(x) return x def prune_linear(linear, prune_ratio=0.3, method="l1"): """Prune a linear layer. Modifies the module in-place. We make an assumption that the bias is included. Parameters ---------- linear : nn.Linear Linear module containing a bias. prune_ratio : float Number between 0 and 1 representing the percentage of weights to prune. method : str, {"l1", "random"} Pruning method to use. """ if method == "l1": prune_func = l1_unstructured elif method == "random": prune_func = random_unstructured else: raise ValueError prune_func(linear, "weight", prune_ratio) prune_func(linear, "bias", prune_ratio) def prune_mlp(mlp, prune_ratio=0.3, method="l1"): """Prune each layer of the multilayer perceptron. Modifies the module in-place. We make an assumption that each linear layer has the bias included. Parameters ---------- mlp : MLP Multilayer perceptron instance. prune_ratio : float or list Number between 0 and 1 representing the percentage of weights to prune. If `list` then different ratio for each layer. method : str, {"l1", "random"} Pruning method to use. """ if isinstance(prune_ratio, float): prune_ratios = [prune_ratio] * len(mlp.module_list) elif isinstance(prune_ratio, list): if len(prune_ratio) != len(mlp.module_list): raise ValueError("Incompatible number of prune ratios provided") prune_ratios = prune_ratio else: raise TypeError for prune_ratio, linear in zip(prune_ratios, mlp.module_list): prune_linear(linear, prune_ratio=prune_ratio, method=method) def check_pruned_linear(linear): """Check if a Linear module was pruned. We require both the bias and the weight to be pruned. Parameters ---------- linear : nn.Linear Linear module containing a bias. Returns ------- bool True if the model has been pruned. """ params = {param_name for param_name, _ in linear.named_parameters()} expected_params = {"weight_orig", "bias_orig"} return params == expected_params def reinit_linear(linear): """Reinitialize a linear layer. This is an in-place operation. If the module has some pruning logic we are not going to remove it and we only initialize the underlying tensors - `weight_orig` and `bias_orig`. Parameters ---------- linear : nn.Linear Linear model containing a bias. """ is_pruned = check_pruned_linear(linear) # Get parameters of interest if is_pruned: weight = linear.weight_orig bias = linear.bias_orig else: weight = linear.weight bias = linear.bias # Initialize weight nn.init.kaiming_uniform_(weight, a=math.sqrt(5)) # Initialize bias fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 nn.init.uniform_(bias, -bound, bound) def reinit_mlp(mlp): """Reinitialize all layers of the MLP. Parameters ---------- mlp : MLP Multi-layer perceptron. """ for linear in mlp.module_list: reinit_linear(linear) def copy_weights_linear(linear_unpruned, linear_pruned): """Copy weights from an unpruned model to a pruned model. Modifies `linear_pruned` in place. Parameters ---------- linear_unpruned : nn.Linear Linear model with a bias that was not pruned. linear_pruned : nn.Linear Linear model with a bias that was pruned. """ assert check_pruned_linear(linear_pruned) assert not check_pruned_linear(linear_unpruned) with torch.no_grad(): linear_pruned.weight_orig.copy_(linear_unpruned.weight) linear_pruned.bias_orig.copy_(linear_unpruned.bias) def copy_weights_mlp(mlp_unpruned, mlp_pruned): """Copy weights of an unpruned network to a pruned network. Modifies `mlp_pruned` in place. Parameters ---------- mlp_unpruned : MLP MLP model that was not pruned. mlp_pruned : MLP MLP model that was pruned. """ zipped = zip(mlp_unpruned.module_list, mlp_pruned.module_list) for linear_unpruned, linear_pruned in zipped: copy_weights_linear(linear_unpruned, linear_pruned) def compute_stats(mlp): """Compute important statistics related to pruning. Parameters ---------- mlp : MLP Multilayer perceptron. Returns ------- dict Statistics. """ stats = {} total_params = 0 total_pruned_params = 0 for layer_ix, linear in enumerate(mlp.module_list): assert check_pruned_linear(linear) weight_mask = linear.weight_mask bias_mask = linear.bias_mask params = weight_mask.numel() + bias_mask.numel() pruned_params = (weight_mask == 0).sum() + (bias_mask == 0).sum() total_params += params total_pruned_params += pruned_params stats[f"layer{layer_ix}_total_params"] = params stats[f"layer{layer_ix}_pruned_params"] = pruned_params stats[f"layer{layer_ix}_actual_prune_ratio"] = pruned_params / params stats["total_params"] = total_params stats["total_pruned_params"] = total_pruned_params stats["actual_prune_ratio"] = total_pruned_params / total_params return stats ================================================ FILE: github_adventures/mixer/README.md ================================================ Note that the `official.py` is just a copy of the code provided in `https://arxiv.org/abs/2105.01601` and probably here `https://github.com/google-research/vision_transformer`. Please refer to those sources for licensing information. ================================================ FILE: github_adventures/mixer/official.py ================================================ import einops import flax.linen as nn import jax.numpy as jnp class MlpBlock(nn.Module): mlp_dim: int @nn.compact def __call__(self, x): y = nn.Dense(self.mlp_dim)(x) y = nn.gelu(y) return nn.Dense(x.shape[-1])(y) class MixerBlock(nn.Module): tokens_mlp_dim: int channels_mlp_dim: int @nn.compact def __call__(self, x): y = nn.LayerNorm()(x) # (n_samples, n_patches, hidden_dim) y = jnp.swapaxes(y, 1, 2) y = MlpBlock(self.tokens_mlp_dim, name="token_mixing")(y) y = jnp.swapaxes(y, 1, 2) x = x + y y = nn.LayerNorm()(x) return x + MlpBlock(self.channels_mlp_dim, name="channel_mixing")(y) class MlpMixer(nn.Module): num_classes: int num_blocks: int patch_size: int hidden_dim: int tokens_mlp_dim: int channels_mlp_dim: int @nn.compact def __call__(self, x): s = self.patch_size x = nn.Conv(self.hidden_dim, (s, s), strides=(s, s), name="stem")(x) x = einops.rearrange(x, "n h w c -> n (h w) c") for _ in range(self.num_blocks): x = MixerBlock(self.tokens_mlp_dim, self.channels_mlp_dim)(x) x = nn.LayerNorm(name="pre_head_layer_norm")(x) x = jnp.mean(x, axis=1) return nn.Dense( self.num_classes, name="head", kernel_init=nn.initializers.zeros )(x) ================================================ FILE: github_adventures/mixer/ours.py ================================================ import einops import torch.nn as nn class MlpBlock(nn.Module): """Multilayer perceptron. Parameters ---------- dim : int Input and output dimension of the entire block. Inside of the mixer it will either be equal to `n_patches` or `hidden_dim`. mlp_dim : int Dimension of the hidden layer. Attributes ---------- linear_1, linear_2 : nn.Linear Linear layers. activation : nn.GELU Activation. """ def __init__(self, dim, mlp_dim=None): super().__init__() mlp_dim = dim if mlp_dim is None else mlp_dim self.linear_1 = nn.Linear(dim, mlp_dim) self.activation = nn.GELU() self.linear_2 = nn.Linear(mlp_dim, dim) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Input tensor of shape `(n_samples, n_channels, n_patches)` or `(n_samples, n_patches, n_channels)`. Returns ------- torch.Tensor Output tensor that has exactly the same shape as the input `x`. """ x = self.linear_1(x) # (n_samples, *, mlp_dim) x = self.activation(x) # (n_samples, *, mlp_dim) x = self.linear_2(x) # (n_samples, *, dim) return x class MixerBlock(nn.Module): """Mixer block that contains two `MlpBlock`s and two `LayerNorm`s. Parameters ---------- n_patches : int Number of patches the image is split up into. hidden_dim : int Dimensionality of patch embeddings. tokens_mlp_dim : int Hidden dimension for the `MlpBlock` when doing token mixing. channels_mlp_dim : int Hidden dimension for the `MlpBlock` when doing channel mixing. Attributes ---------- norm_1, norm_2 : nn.LayerNorm Layer normalization. token_mlp_block : MlpBlock Token mixing MLP. channel_mlp_block : MlpBlock Channel mixing MLP. """ def __init__( self, *, n_patches, hidden_dim, tokens_mlp_dim, channels_mlp_dim ): super().__init__() self.norm_1 = nn.LayerNorm(hidden_dim) self.norm_2 = nn.LayerNorm(hidden_dim) self.token_mlp_block = MlpBlock(n_patches, tokens_mlp_dim) self.channel_mlp_block = MlpBlock(hidden_dim, channels_mlp_dim) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Tensor of shape `(n_samples, n_patches, hidden_dim)`. Returns ------- torch.Tensor Tensor of the same shape as `x`, i.e. `(n_samples, n_patches, hidden_dim)`. """ y = self.norm_1(x) # (n_samples, n_patches, hidden_dim) y = y.permute(0, 2, 1) # (n_samples, hidden_dim, n_patches) y = self.token_mlp_block(y) # (n_samples, hidden_dim, n_patches) y = y.permute(0, 2, 1) # (n_samples, n_patches, hidden_dim) x = x + y # (n_samples, n_patches, hidden_dim) y = self.norm_2(x) # (n_samples, n_patches, hidden_dim) res = x + self.channel_mlp_block( y ) # (n_samples, n_patches, hidden_dim) return res class MlpMixer(nn.Module): """Entire network. Parameters ---------- image_size : int Height and width (assuming it is a square) of the input image. patch_size : int Height and width (assuming it is a square) of the patches. Note that we assume that `image_size % patch_size == 0`. tokens_mlp_dim : int Hidden dimension for the `MlpBlock` when doing the token mixing. channels_mlp_dim : int Hidden dimension for the `MlpBlock` when diong the channel mixing. n_classes : int Number of classes for classification. hidden_dim : int Dimensionality of patch embeddings. n_blocks : int The number of `MixerBlock`s in the architecture. Attributes ---------- patch_embedder : nn.Conv2D Splits the image up into multiple patches and then embeds each of them (using shared weights). blocks : nn.ModuleList List of `MixerBlock` instances. pre_head_norm : nn.LayerNorm Layer normalization applied just before the classification head. head_classifier : nn.Linear The classification head. """ def __init__( self, *, image_size, patch_size, tokens_mlp_dim, channels_mlp_dim, n_classes, hidden_dim, n_blocks, ): super().__init__() n_patches = (image_size // patch_size) ** 2 # assumes divisibility self.patch_embedder = nn.Conv2d( 3, hidden_dim, kernel_size=patch_size, stride=patch_size, ) self.blocks = nn.ModuleList( [ MixerBlock( n_patches=n_patches, hidden_dim=hidden_dim, tokens_mlp_dim=tokens_mlp_dim, channels_mlp_dim=channels_mlp_dim, ) for _ in range(n_blocks) ] ) self.pre_head_norm = nn.LayerNorm(hidden_dim) self.head_classifier = nn.Linear(hidden_dim, n_classes) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Input batch of square images of shape `(n_samples, n_channels, image_size, image_size)`. Returns ------- torch.Tensor Class logits of shape `(n_samples, n_classes)`. """ x = self.patch_embedder( x ) # (n_samples, hidden_dim, n_patches ** (1/2), n_patches ** (1/2)) x = einops.rearrange( x, "n c h w -> n (h w) c" ) # (n_samples, n_patches, hidden_dim) for mixer_block in self.blocks: x = mixer_block(x) # (n_samples, n_patches, hidden_dim) x = self.pre_head_norm(x) # (n_samples, n_patches, hidden_dim) x = x.mean(dim=1) # (n_samples, hidden_dim) y = self.head_classifier(x) # (n_samples, n_classes) return y ================================================ FILE: github_adventures/mixer/test_compare.py ================================================ import jax import numpy as np import pytest import torch from official import MlpMixer as OfficialMixer from ours import MlpMixer as OurMixer @pytest.mark.parametrize("image_size", [6, 12]) @pytest.mark.parametrize("patch_size", [2, 3]) @pytest.mark.parametrize("hidden_dim", [4, 5]) @pytest.mark.parametrize("n_blocks", [1, 2]) @pytest.mark.parametrize("n_classes", [4, 8]) @pytest.mark.parametrize("tokens_mlp_dim", [2, 4]) @pytest.mark.parametrize("channels_mlp_dim", [3, 6]) def test_compare( image_size, patch_size, hidden_dim, n_blocks, n_classes, tokens_mlp_dim, channels_mlp_dim, ): # Create Flax model model_flax = OfficialMixer( num_classes=n_classes, num_blocks=n_blocks, patch_size=patch_size, hidden_dim=hidden_dim, tokens_mlp_dim=tokens_mlp_dim, channels_mlp_dim=channels_mlp_dim, ) key1, key2 = jax.random.split(jax.random.PRNGKey(0)) x = jax.random.normal(key1, (11, image_size, image_size, 3)) # Dummy input params = model_flax.init(key2, x) # initialization call n_params_flax = sum( jax.tree_leaves(jax.tree_map(lambda x: np.prod(x.shape), params)) ) shape_flax = model_flax.apply(params, x).shape # Create Torch model model_torch = OurMixer( image_size=image_size, patch_size=patch_size, hidden_dim=hidden_dim, n_blocks=n_blocks, n_classes=n_classes, tokens_mlp_dim=tokens_mlp_dim, channels_mlp_dim=channels_mlp_dim, ) n_params_torch = sum( p.numel() for p in model_torch.parameters() if p.requires_grad ) shape_torch = model_torch(torch.rand(11, 3, image_size, image_size)).shape assert n_params_flax == n_params_torch assert shape_flax == shape_torch == (11, n_classes) ================================================ FILE: github_adventures/mixup/launch_experiments.sh ================================================ set -x N_EPOCHS=100000 N_SAMPLES=1000 SEED=123 TBOARD_DIR=tb_results/$SEED python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/no_regularization python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/weight_decay --weight-decay 0.6 python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/dropout -p 0.2 python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/mixup --mixup python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/input_mixup -k 0 1 --mixup python train.py -r $SEED -n $N_EPOCHS -s $N_SAMPLES $TBOARD_DIR/hidden_layers_mixup -k 1 4 --mixup ================================================ FILE: github_adventures/mixup/train.py ================================================ import argparse import json import numpy as np import torch from sklearn.model_selection import train_test_split from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter from utils import ( CustomDataset, MLPClassifierMixup, generate_prediction_img, generate_spirals, ) def main(argv=None): parser = argparse.ArgumentParser("Training") # Parameters parser.add_argument( "logpath", type=str, ) parser.add_argument( "-b", "--batch-size", type=int, default=32, help="Batch size", ) parser.add_argument( "--mixup", action="store_true", ) parser.add_argument( "-p", "--dropout-probability", type=float, default=0, help="The probability of dropout", ) parser.add_argument( "--hidden-dims", nargs="+", type=int, default=(32, 32, 32), help="Hidden dimensions of the MLP", ) parser.add_argument( "-c", "--n-cycles", type=float, default=2, help="Number of cycles when creating the spiral dataset", ) parser.add_argument( "-n", "--n-epochs", type=int, default=100, help="Number of epochs", ) parser.add_argument( "-k", "--mixing-layer", type=int, nargs=2, default=(None, None), help="The range of k to sample from", ) parser.add_argument( "-s", "--n-samples", type=int, default=1000, help="Number of samples", ) parser.add_argument( "-r", "--random-state", type=int, default=5, help="Random state", ) parser.add_argument( "--weight-decay", type=float, default=0.0, help="Weight decay", ) args = parser.parse_args(argv) device = torch.device("cpu") dtype = torch.float32 np.random.seed(args.random_state) torch.manual_seed(args.random_state) # Dataset preparation X, y = generate_spirals( args.n_samples, noise_std=0, n_cycles=args.n_cycles, ) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.9, shuffle=True, stratify=y, ) X_test_t = torch.from_numpy(X_test).to(device, dtype) dataset_train = CustomDataset(X_train, y_train) dataloader_train = DataLoader( dataset_train, batch_size=2 * args.batch_size, drop_last=True, shuffle=True, ) # Model and loss definition model = MLPClassifierMixup( n_features=2, hidden_dims=tuple(args.hidden_dims), p=args.dropout_probability, ) model.to(device, dtype) optimizer = torch.optim.AdamW( model.parameters(), weight_decay=args.weight_decay, ) loss_fn = torch.nn.BCEWithLogitsLoss() # Summary writer = SummaryWriter(args.logpath) writer.add_text("hparams", json.dumps(vars(args))) # Training + evaluation loop bs = args.batch_size n_steps = 0 for e in range(args.n_epochs): for X_batch, y_batch in dataloader_train: X_batch, y_batch = X_batch.to(device, dtype), y_batch.to( device, dtype ) if args.mixup: k_min, k_max = args.mixing_layer k_min = k_min or 0 k_max = k_max or model.n_hidden + 1 k = np.random.randint(k_min, k_max) lam = np.random.beta(2, 2) writer.add_scalar("k", k, n_steps) writer.add_scalar("lambda", lam, n_steps) h = model(X_batch, start=0, end=k) # (2 * batch_size, *) h_mixed = lam * h[:bs] + (1 - lam) * h[bs:] # (batch_size, *) y_mixed = lam * y_batch[:bs] + (1 - lam) * y_batch[bs:] # (batch_size,) logits = model(h_mixed, start=k, end=None) # (batch_size, 1) loss = loss_fn(logits.squeeze(), y_mixed) else: logits = model(X_batch[:bs]) # (batch_size, 1) loss = loss_fn(logits.squeeze(), y_batch[:bs]) optimizer.zero_grad() loss.backward() optimizer.step() # Logging writer.add_scalar("loss_train", loss, n_steps) if n_steps % 2500 == 0: model.eval() fig_gen = generate_prediction_img( model, X_train, X_test, y_train, y_test, ) writer.add_figure("test", next(fig_gen)) writer.add_figure("contour", next(fig_gen), n_steps) writer.add_figure("contour_train", next(fig_gen), n_steps) with torch.no_grad(): logits_test = model(X_test_t).squeeze().detach().cpu() acc_test = ( torch.sigmoid(logits_test).round().numpy() == y_test ).sum() / len(y_test) loss_test = loss_fn(logits_test, torch.from_numpy(y_test)) writer.add_scalar("loss_test", loss_test, n_steps) writer.add_scalar("accuracy_test", acc_test, n_steps) model.train() n_steps += 1 if __name__ == "__main__": main() ================================================ FILE: github_adventures/mixup/utils.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch import torch.nn as nn from matplotlib.colors import ListedColormap from torch.utils.data import Dataset class MLPClassifierMixup(nn.Module): """Multilayer perceptron with inbuilt mixup logic. Assuming binary classification. Parameters ---------- n_features : int Number of features. hidden_dims : tuple The sizes of the hidden layers. p : float Dropout probability. Attributes ---------- hidden_layers : nn.ModuleList List of hidden layers that are each composed of a `Linear`, `LeakyReLU` and `Dropout` modules. n_hidden : int Number of hidden layers. clf : nn.Linear The classifier at the end of the pipeline. """ def __init__(self, n_features, hidden_dims, p=0): super().__init__() dims = (n_features,) + hidden_dims self.n_hidden = len(hidden_dims) self.hidden_layers = nn.ModuleList( [ nn.Sequential( nn.Linear(dims[i], dims[i + 1]), nn.LeakyReLU(0.2), nn.Dropout(p), ) for i in range(self.n_hidden) ] ) self.clf = nn.Linear(dims[-1], 1) def forward(self, x, start=0, end=None): """Run forward pass. Parameters ---------- x : torch.Tensor Input of shape `(n_samples, dim)`. Note that the dim will depend on `start`. start : int The hidden layer where the forward pass starts (inclusive). We use a convention of `start=0` and `end=0` as a noop and the input tensor is returned. Useful for implementing input mixing. end : int or None The ending hidden layer (exclusive). If None, then always run until the last hidden layer and then we also apply the classifier. """ for module in self.hidden_layers[start:end]: x = module(x) if end is None: x = self.clf(x) return x class CustomDataset(Dataset): """Custom classification dataset assuming we have X and y loaded in memory. Parameters ---------- X : np.ndarray Features of shape `(n_samples, n_features)`. y : np.ndarray Targets of shape `(n_samples,)`. """ def __init__(self, X, y): if len(X) != len(y): raise ValueError("Inconsistent number of samples") classes = np.unique(y) if not np.array_equal(np.sort(classes), np.array([0, 1])): raise ValueError self.X = X self.y = y def __len__(self): """Compute the length of the dataset.""" return len(self.X) def __getitem__(self, ix): """Return a single sample.""" return self.X[ix], self.y[ix] def generate_spirals( n_samples, noise_std=0.05, n_cycles=2, random_state=None, ): """Generate two spirals dataset. Parameters ---------- n_samples : int Number of samples to generate. For simplicity, an even number is required. The targets (2 spirals) are perfectly balanced. noise_std : float Standard deviation of the noise added to the spirals. n_cycles : int Number of revolutions the spirals make. random_state : int or None Controls randomness. Returns ------- X : np.ndarray Features of shape `(n_samples, n_features)`. y : np.ndarray Targets of shape `(n_samples,)`. There are two classes 0 and 1 representing the two spirals. """ if n_samples % 2 != 0: raise ValueError("The number of samples needs to be even") n_samples_per_class = int(n_samples // 2) angle_1 = np.linspace(0, n_cycles * 2 * np.pi, n_samples_per_class) angle_2 = np.pi + angle_1 radius = np.linspace(0.2, 2, n_samples_per_class) x_1 = radius * np.cos(angle_1) y_1 = radius * np.sin(angle_1) x_2 = radius * np.cos(angle_2) y_2 = radius * np.sin(angle_2) X = np.concatenate( [ np.stack([x_1, y_1], axis=1), np.stack([x_2, y_2], axis=1), ], axis=0, ) y = np.zeros((n_samples,)) y[n_samples_per_class:] = 1.0 if random_state is not None: np.random.seed(random_state) new_ixs = np.random.permutation(n_samples) X = X[new_ixs] + np.random.normal( loc=0, scale=noise_std, size=(n_samples, 2) ) y = y[new_ixs] return X, y def generate_prediction_img( model, X_train, X_test, y_train, y_test, ): """Generate contour and scatter plots with predictions. Parameters ---------- model : MLPClassifierMixup Instance of a multilayer-perceptron. X_train, X_test : np.ndarray Trand and test features of shape `(n_samples, n_features)`. y_train, y_test : np.ndarray Train and test targets of shape `(n_samples,)`. Yields ------ matplotlib.Figure Different figures. """ device = next(model.parameters()).device dtype = next(model.parameters()).dtype cm = plt.cm.RdBu cm_bright = ListedColormap(["#FF0000", "#0000FF"]) delta = 0.5 xlim = (X_test[:, 0].min() - delta, X_test[:, 0].max() + delta) ylim = (X_test[:, 1].min() - delta, X_test[:, 1].max() + delta) n = 50 xx, yy = np.meshgrid( np.linspace(xlim[0], xlim[1], n), np.linspace(ylim[0], ylim[1], n), ) grid = np.stack([xx.ravel(), yy.ravel()], axis=1) with torch.no_grad(): logits = model(torch.from_numpy(grid).to(device, dtype)) probs = torch.sigmoid(logits)[:, 0].detach().cpu().numpy() probs = probs.reshape(xx.shape) fig, ax = plt.subplots(1, 1, dpi=170) ax.scatter( X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, edgecolors="k" ) ax.set_title("Test data") yield fig ax.cla() ax.contourf(xx, yy, probs, cmap=cm, alpha=0.8) ax.set_title("Prediction contours") yield fig ax.scatter( X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k" ) ax.set_title("Train data + prediction contours") yield fig ================================================ FILE: github_adventures/ner_evaluation/README.md ================================================ * https://github.com/huggingface/evaluate/blob/af3c30561d840b83e54fc5f7150ea58046d6af69/metrics/seqeval/seqeval.py#L120 * https://github.com/chakki-works/seqeval/blob/cd01b5210eaa65e691c22320aba56f2be9e9fc43/seqeval/metrics/sequence_labeling.py#L1 ================================================ FILE: github_adventures/ner_evaluation/ours.py ================================================ import re import pandas as pd from sklearn.metrics import classification_report def check_valid(annots: list[str]) -> bool: allowed_pattern = re.compile(r"^(O$|B-.+$|I-.+$)") annots = ["O"] + annots n = len(annots) if any(allowed_pattern.match(annot) is None for annot in annots): return False for i in range(1, n): annot = annots[i] if annot.startswith("I-"): if annots[i - 1] == "O" or annots[i - 1][2:] != annot[2:]: return False return True def get_etypes(annots: list[str]) -> list[None | str]: return [annot[2:] if annot != "O" else None for annot in annots] def get_entities(annots: list[str]) -> list[dict[str, int | str]]: if not check_valid(annots): raise ValueError("Invalid input.") annots = ["O"] + annots + ["O"] etypes = get_etypes(annots) n = len(annots) start_patterns = { ("O", "B-"), # ["O", "B-LOC"] ("B-", "B-"), # ["B-PERSON", "B-LOC"] ("I-", "B-"), # ["B-LOC", "I-LOC", "B-PERSON"] } end_patterns = { ("I-", "O"), # ["B-LOC", "I-LOC", "O"] ("B-", "O"), # ["B-LOC", "O"] ("B-", "B-"), # ["B-PERSON", "B-LOC"] ("I-", "B-"), # ["B-LOC", "I-LOC", "B-PERSON"] } entities: list[dict[str, int | str]] = [] i = 1 start = None while i < n: prev, curr = annots[i - 1], annots[i] pattern = (prev[:2], curr[:2]) if pattern in end_patterns and start is not None: entities.append( { "start": start - 1, "end": i - 2, "etype": etypes[i - 1], } ) start = None if pattern in start_patterns: start = i i += 1 return entities def get_report(annots_true: list[str], annots_pred: list[str]) -> dict: if len(annots_true) != len(annots_pred): raise ValueError("Unequal lengths") entities_true = pd.DataFrame(get_entities(annots_true)) entities_pred = pd.DataFrame(get_entities(annots_pred)) entities_true = entities_true.rename(columns={"etype": "etype_true"}) entities_pred = entities_pred.rename(columns={"etype": "etype_pred"}) df_merge = entities_true.merge(entities_pred, on=["start", "end"], how="outer") df = df_merge.fillna("") labels = (set(df["etype_true"].tolist()) | set(df["etype_pred"].tolist())) - {""} report = classification_report( df["etype_true"], df["etype_pred"], output_dict=True, labels=list(labels), ) return report ================================================ FILE: github_adventures/ner_evaluation/test_ours.py ================================================ import pytest from seqeval.metrics import classification_report as cr from seqeval.scheme import IOB2 from ours import check_valid, get_entities, get_etypes, get_report @pytest.mark.parametrize( "inp,out", [ ([], True), (["NONSENSE", "O"], False), (["O", "O", "O"], True), (["B-"], False), (["O", "I-ORG", "O"], False), (["O", "B-ORG", "I-PERSON"], False), (["O", "B-ORG", "B-PERSON"], True), (["O", "SOMETHING", "B-PERSON"], False), (["O-", "O", "O"], False), (["B-A", "O", "B-T"], True), (["I-a", "B-a", "B-a", "I-a", "I-a", "O"], False), ], ) def test_check_valid(inp, out): assert check_valid(inp) == out @pytest.mark.parametrize( "inp,out", [ ([], []), (["O", "O", "O"], [None, None, None]), (["O", "B-ORG", "O"], [None, "ORG", None]), (["O", "B-ORG", "B-ORG"], [None, "ORG", "ORG"]), (["O", "B-PERSON", "I-PERSON"], [None, "PERSON", "PERSON"]), (["B-A", "O", "B-T"], ["A", None, "T"]), ], ) def test_get_etypes(inp, out): assert get_etypes(inp) == out @pytest.mark.parametrize( "inp,out", [ (["O", "O", "O"], []), (["O", "B-ORG", "O"], [{"start": 1, "end": 1, "etype": "ORG"}]), ( ["O", "B-ORG", "B-ORG"], [ {"start": 1, "end": 1, "etype": "ORG"}, {"start": 2, "end": 2, "etype": "ORG"}, ], ), (["O", "B-PERSON", "I-PERSON"], [{"start": 1, "end": 2, "etype": "PERSON"}]), ( ["B-A", "O", "B-T"], [ {"start": 0, "end": 0, "etype": "A"}, {"start": 2, "end": 2, "etype": "T"}, ], ), (["B-LOC", "I-LOC", "I-LOC"], [{"start": 0, "end": 2, "etype": "LOC"}]), ( ["B-A", "I-A", "B-T"], [ {"start": 0, "end": 1, "etype": "A"}, {"start": 2, "end": 2, "etype": "T"}, ], ), ], ) def test_get_entities(inp, out): assert get_entities(inp) == out @pytest.mark.parametrize( "annots_true,annots_pred", [ ( ["O", "B-PERSON", "I-PERSON", "O"], ["O", "B-PERSON", "I-PERSON", "O"], ), ( ["O", "B-PERSON", "I-PERSON", "B-LOC"], ["O", "B-PERSON", "I-PERSON", "O"], ), ( ["O", "B-PERSON", "I-PERSON", "O"], ["O", "O", "B-PERSON", "O"], ), ( ["O", "B-PERSON", "I-PERSON", "O"], ["O", "O", "B-PERSON", "O"], ), ( ["B-PERSON", "B-LOC", "I-LOC", "B-DATE"], ["B-PERSON", "B-DATE", "B-PERSON", "B-DATE"], ), ( ["B-PERSON", "I-PERSON", "I-PERSON", "O", "O", "B-LOC", "B-DATE"], ["B-PERSON", "I-PERSON", "I-PERSON", "O", "O", "B-LOC", "B-DATE"], ), ( ["B-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-LOC"], ["B-PERSON", "O", "B-DATE", "O", "B-LOC", "I-LOC", "I-LOC", "I-LOC"], ), ( ["B-PERSON", "I-PERSON", "O", "B-LOC", "I-LOC", "O", "B-PERSON", "B-PERSON", "B-LOC"], ["B-PERSON", "I-PERSON", "O", "B-LOC", "B-LOC", "O", "B-PERSON", "B-PERSON", "B-LOC"], ), ] ) def test_get_report(annots_true, annots_pred): report = get_report(annots_true, annots_pred) seqeval_report = cr([annots_true], [annots_pred], scheme=IOB2, mode="strict", output_dict=True) keys_to_delete = {"accuracy", "micro avg"} for rep in (report, seqeval_report): for key in keys_to_delete: try: rep.pop(key) except KeyError: pass assert report == seqeval_report ================================================ FILE: github_adventures/ner_evaluation/try.py ================================================ import pprint import evaluate metric = evaluate.load("seqeval") # Tom Cruise is great annots_true = ["B-PERSON", "I-PERSON", "O", "O"] # annots_pred = ["B-PERSON", "I-PERSON", "O", "O"] # annots_pred = ["O", "O", "O", "O"] # annots_pred = ["B-PERSON", "O", "O", "O"] annots_pred = ["B-LOCATION", "I-LOCATION", "O", "O"] result = metric.compute(references=[annots_true], predictions=[annots_pred]) pprint.pprint(result) ================================================ FILE: github_adventures/neuron/README.md ================================================ # Installation ```bash pip install -r requirements.txt ``` # Running training To run the same experiments as in the video run ```bash ./launch.sh ``` However, feel free to check the contents of the `launch.sh` for single experiments. # Evaluation and pretrained models This repo contains multiple pretrained models inside of `pretrained/`. They are all `.pkl` files and they were created by pickling `solutions.Solution` subclasses. To load them inside of Python run something along these lines ```python import pickle solution_path = "pretrained/invariant_ours.pkl" # you can change this with open(solution_path, "rb") as f: solution = pickle.load(f)[0] ``` You can also run any of the below scripts to reproduce the results from the end of the video. ```bash EPISODES=30 python evaluate_shuffling.py -e $EPISODES python evaluate_noise.py -e $EPISODES python evaluate_video.py -e $EPISODES ``` ================================================ FILE: github_adventures/neuron/evaluate_noise.py ================================================ """Assumes you have already trained your model and you have a checkpoint.""" import argparse import pathlib import pickle import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from tasks import IncompatibleNFeatures, Task def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( "-e", "--n-episodes", type=int, default=200, ) args = parser.parse_args(argv) # Prepare solutions and tasks checkpoint_path = pathlib.Path("pretrained") / "invariant_official.pkl" assert checkpoint_path.exists() with checkpoint_path.open("rb") as f: obj = pickle.load(f) if len(obj) == 1: solution_inst = obj[0] elif len(obj) == 2: solver, solution_inst = obj solution_inst.set_params(solver.result.xfavorite) else: raise ValueError results = [] for n_noise_features in range(0, 30, 5): for shuffle in [True, False]: print(f"{n_noise_features=}, {shuffle=}") task = Task( render=False, n_noise_features=n_noise_features, shuffle_on_reset=shuffle, env_seed=None, feature_seed=None, ) for episode_ix in range(args.n_episodes): reward = task.rollout(solution_inst) results.append( { "n_noise_features": n_noise_features, "shuffle": shuffle, "episode_ix": episode_ix, "reward": reward, } ) results_df = pd.DataFrame(results) fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300) sns.violinplot( data=results_df, x="n_noise_features", y="reward", hue="shuffle", split=True, inner="quart", linewidth=1, palette="muted", ax=ax, scale="count", ) sns.despine(left=True) ax.set_ylim(0, 1000) ax.grid(True) fig.tight_layout() fig.savefig("invariant_model_noise.png") if __name__ == "__main__": main() ================================================ FILE: github_adventures/neuron/evaluate_shuffling.py ================================================ """Assumes you have already trained your model and you have a checkpoint.""" import argparse import pathlib import pickle import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from tasks import IncompatibleNFeatures, Task def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( "-e", "--n-episodes", type=int, default=200, ) args = parser.parse_args(argv) # Prepare solutions and tasks checkpoints = {} checkpoint_folder = pathlib.Path("pretrained") assert checkpoint_folder.exists() checkpoint_paths = [ checkpoint_folder / "linear.pkl", checkpoint_folder / "linear_augment.pkl", checkpoint_folder / "MLP.pkl", checkpoint_folder / "MLP_augment.pkl", checkpoint_folder / "invariant_ours.pkl", checkpoint_folder / "invariant_official.pkl", ] for path in checkpoint_paths: with path.open("rb") as f: obj = pickle.load(f) if len(obj) == 1: solution_inst = obj[0] elif len(obj) == 2: solver, solution_inst = obj solution_inst.set_params(solver.result.xfavorite) else: raise ValueError checkpoints[path.stem] = solution_inst results = [] for model_name, solution_inst in checkpoints.items(): for shuffle in [True, False]: print(f"{model_name=}, {shuffle=}") task = Task( render=False, n_noise_features=0, shuffle_on_reset=shuffle, env_seed=None, feature_seed=None, ) for episode_ix in range(args.n_episodes): reward = task.rollout(solution_inst) results.append( { "model": model_name, "shuffle": shuffle, "episode_ix": episode_ix, "reward": reward, } ) results_df = pd.DataFrame(results) fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=300) sns.violinplot( data=results_df, x="model", y="reward", hue="shuffle", split=True, inner="quart", linewidth=1, palette="muted", ax=ax, scale="count", order=sorted(checkpoints.keys()), ) sns.despine(left=True) ax.set_ylim(0, 1000) ax.grid(True) fig.tight_layout() fig.savefig("all_models_shuffling.png") if __name__ == "__main__": main() ================================================ FILE: github_adventures/neuron/evaluate_video.py ================================================ """Assumes you have already trained your model and you have a checkpoint.""" import argparse import pathlib import pickle from gym.wrappers import Monitor import matplotlib.pyplot as plt from tasks import IncompatibleNFeatures, Task def main(argv=None): parser = argparse.ArgumentParser() parser.add_argument( "-e", "--n-episodes", type=int, default=2, ) args = parser.parse_args(argv) # Prepare solutions and tasks checkpoints = {} checkpoint_folder = pathlib.Path("pretrained") assert checkpoint_folder.exists() checkpoint_paths = [ checkpoint_folder / "linear.pkl", checkpoint_folder / "linear_augment.pkl", checkpoint_folder / "MLP.pkl", checkpoint_folder / "MLP_augment.pkl", checkpoint_folder / "invariant_ours.pkl", checkpoint_folder / "invariant_official.pkl", ] checkpoint_paths = checkpoint_paths for path in checkpoint_paths: with path.open("rb") as f: obj = pickle.load(f) if len(obj) == 1: solution_inst = obj[0] elif len(obj) == 2: solver, solution_inst = obj solution_inst.set_params(solver.result.xfavorite) else: raise ValueError checkpoints[path.stem] = solution_inst for model_name, solution_inst in checkpoints.items(): for shuffle in [True, False]: for episode_ix in range(args.n_episodes): print(f"{model_name=}, {shuffle=}") task = Task( render=False, n_noise_features=0, shuffle_on_reset=shuffle, env_seed=None, feature_seed=None, ) task.env = Monitor( task.env, f"videos/{model_name}/{shuffle}/{episode_ix}/", ) reward = task.rollout(solution_inst) if __name__ == "__main__": main() ================================================ FILE: github_adventures/neuron/launch.sh ================================================ OUTPUT_FOLDER=log_dir python trainer.py --max-iter 1000 linear $OUTPUT_FOLDER/linear python trainer.py --max-iter 1000 --shuffle-on-reset linear $OUTPUT_FOLDER/linear_augment python trainer.py --max-iter 1000 MLP $OUTPUT_FOLDER/MLP python trainer.py --max-iter 2000 --shuffle-on-reset MLP $OUTPUT_FOLDER/MLP_augment python trainer.py --max-iter 14000 invariant $OUTPUT_FOLDER/invariant ================================================ FILE: github_adventures/neuron/requirements.txt ================================================ cma gym gym-cartpole-swingup matplotlib numpy pandas seaborn tensorboard torch tqdm ================================================ FILE: github_adventures/neuron/solutions.py ================================================ import abc import numpy as np import torch from torch_utils import PermutationInvariantNetwork, MLP class Solution(abc.ABC): """Solution abstract class. Attributes ---------- policy : torch.nn.Module Network that holds all the learnable parameters. """ @abc.abstractmethod def clone(self, obs): """Create a copy of the current solution without any links to self.""" @abc.abstractmethod def get_action(self, obs): """Determine the next action given the observation array.""" @abc.abstractmethod def get_n_features(self): """Get the number of features expected by the model. If None then the model can process variable-sized feature vectors. """ @abc.abstractmethod def reset(self): """Reset solution. Will be called at the beginning of each rollout. Does not mean we will "reinitialize" the weights of `policy`. """ def get_params(self): """Get learnable parameters of the solution. Returns ------- params : np.ndarray 1D array containing all parameters. """ params_l = [] for p in self.policy.parameters(): params_l.append(p.numpy().ravel()) params = np.concatenate(params_l) return params def set_params(self, params): """Set the learnable parameters. Parameters ---------- params : np.ndarray 1D array containing all parameters. Returns ------- self : Solution """ start_ix, end_ix = 0, 0 for p in self.policy.parameters(): end_ix = start_ix + np.prod(p.shape) p.data = torch.from_numpy( params[start_ix:end_ix].reshape(p.shape) ).float() start_ix = end_ix return self def get_n_params(self): return len(self.get_params()) class MLPSolution(Solution): """Multilayer perceptron solution. Parameters ---------- n_features : int Number of input features. hidden_layer_sizes : tuple Tuple of int that defines the sizes of all hidden layers. Attributes ---------- kwargs : dict All parameters necessary to instantiate the class. policy : MLP Policy network - multilayer perceptron. """ def __init__(self, n_features=5, hidden_layer_sizes=(16,)): self.kwargs = { "n_features": n_features, "hidden_layer_sizes": hidden_layer_sizes, } self.dtype = torch.float32 self.policy = MLP(n_features, hidden_layer_sizes) self.policy.to(self.dtype) self.policy.eval() def clone(self): old_policy = self.policy new_solution = self.__class__(**self.kwargs) new_solution.policy.load_state_dict( old_policy.state_dict(), ) return new_solution def get_action(self, obs): y = self.policy(torch.from_numpy(obs).to(self.dtype)) action = y.item() return action def get_n_features(self): return self.kwargs["n_features"] def reset(self): pass class PermutationInvariantSolution(Solution): """Permutation invariant solution. Parameters ---------- n_embeddings : int Number of rows in the Q tensor. proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int Dimensionality of the Q and K tensors before linear projections. Attributes ---------- kwargs : dict All parameters necessary to instantiate the class dtype : torch.dtype Dtype of both the network weights and input features. policy : PermutationInvariantNetwork Policy network. prev_action : float Stores the previous action. Automatically updated each time we call `get_action`. """ def __init__( self, n_embeddings=16, proj_dim=32, hidden_size=8, ): self.kwargs = { "n_embeddings": n_embeddings, "proj_dim": proj_dim, "hidden_size": hidden_size, } self.policy = PermutationInvariantNetwork( n_embeddings=n_embeddings, proj_dim=proj_dim, hidden_size=hidden_size, ) self.dtype = torch.float32 self.policy.to(self.dtype) self.policy.eval() self.prev_action = 0 # will be continuously updated def clone(self): old_policy = self.policy new_solution = self.__class__(**self.kwargs) new_solution.policy.load_state_dict( old_policy.state_dict(), ) return new_solution def get_action(self, obs): y = self.policy(torch.from_numpy(obs).to(self.dtype), self.prev_action) action = y.item() self.prev_action = action return action def reset(self): self.policy.attention_neuron.hx = None self.previous_action = 0 def get_n_features(self): return None ================================================ FILE: github_adventures/neuron/tasks.py ================================================ import gym import gym_cartpole_swingup # noqa has a sideffect import numpy as np N_ORIGINAL_FEATURES = 5 class IncompatibleNFeatures(Exception): """Raised when observation and model number of features does not match.""" class Task: """Cartpoleswingup task. Parameters ---------- render : bool If True, we render each step into a video frame. shuffle_on_reset : bool If True, the features are randomly shuffled before each rollout. n_noise_features : int Number of noise features added to the observation vector. env_seed : None or int Random state controling the underlying `gym.Env`. feature_seed : None or int Random state controling the shuffling and noise features. max_episode_steps : int Maximum number of steps per episode (=rollout). After his number `done=True` automatically. Attributes ---------- n_features : int Overall number of features (original + noise). perm_ix : np.ndarray 1D array storing a permutation indices of the features. env : gym.Env Environment. rnd : RandomState Random state. """ def __init__( self, render=False, shuffle_on_reset=False, n_noise_features=0, env_seed=None, feature_seed=None, max_episode_steps=1000, ): self.env = gym.make("CartPoleSwingUp-v1") self.env._max_episode_steps = max_episode_steps self.shuffle_on_reset = shuffle_on_reset self.render = render self.n_noise_features = n_noise_features self.n_features = N_ORIGINAL_FEATURES + n_noise_features self.perm_ix = np.arange(self.n_features) self.noise_std = 0.1 # Set seeds self.env.seed(env_seed) self.rnd = np.random.RandomState(seed=feature_seed) def reset_for_rollout(self): """Generate a new permutation of the features. It is going to be called at the beginning of each episode. Note that the permutation stays constant throughout the episode. """ self.perm_ix = np.arange(self.n_features) if self.shuffle_on_reset: self.rnd.shuffle(self.perm_ix) def modify_obs(self, obs): """Modify raw observations. Parameters ---------- obs : np.ndarray Raw observation/feature array of shape `(5,)`. Returns ------- obs_modified : np.ndarray Modified observation array of shape `(5 + n_noise_features,)`. If `shuffle_on_reset` then the order of the features is going to change. """ noise = self.rnd.randn(self.n_noise_features) * self.noise_std obs_and_noise = np.concatenate([obs, noise], axis=0) obs_modified = obs_and_noise[self.perm_ix] return obs_modified def rollout(self, solution): """Run a single episode/rollout. Parameters ---------- solution : solutions.Solution Instance of a solution that yields an action given an observation. Returns ------- ep_reward : int Overall episode reward computed as a sum of per step rewards. """ # sanity check n_features_solution = solution.get_n_features() n_features_task = self.n_features if ( n_features_solution is not None and n_features_solution != n_features_task ): raise IncompatibleNFeatures self.reset_for_rollout() solution.reset() # important for PermutationInvariantSolution obs = self.env.reset() if self.render: self.env.render() ep_reward = 0 done = False while not done: obs_modified = self.modify_obs(obs) action = solution.get_action(obs_modified) obs, reward, done, _ = self.env.step(action) ep_reward += reward if self.render: self.env.render() return ep_reward ================================================ FILE: github_adventures/neuron/torch_utils.py ================================================ import numpy as np import torch import torch.nn as nn class MLP(nn.Module): """Multilayer perceptron policy network. Parameters ---------- n_features : int Number of input features. hidden_layer_sizes : tuple Tuple of int that defines the sizes of all hidden layers. Attributes ---------- net : nn.Sequential The actual network. """ def __init__(self, n_features, hidden_layer_sizes): super().__init__() layer_sizes = (n_features,) + hidden_layer_sizes + (1,) layers = [] for i in range(len(layer_sizes) - 1): in_features = layer_sizes[i] out_features = layer_sizes[i + 1] layers.extend( [ nn.Linear(in_features, out_features), nn.Tanh(), ] ) self.net = nn.Sequential(*layers) for p in self.parameters(): p.requires_grad = False def forward(self, obs): """Run forward pass. Parameters ---------- obs : torch.Tensor 1D tensor representing the input observation of shape `(n_features,)`. Returns ------- torch.Tensor Scalar between -1 and 1 representing the action. """ return self.net(obs[None, :])[0] def pos_table(n_embeddings, hidden_size): """Create a table of positional encodings. Parameters ---------- n_embeddings : int Number of rows of the table. hidden_size : int Number of columns of the table. Returns ------- tab : np.ndarray 2D array holding the positional encodings. """ def get_angle(x, h): return x / np.power(10000, 2 * (h // 2) / hidden_size) def get_angle_vec(x): return [get_angle(x, j) for j in range(hidden_size)] tab = np.array([get_angle_vec(i) for i in range(n_embeddings)]).astype( float ) tab[:, 0::2] = np.sin(tab[:, 0::2]) tab[:, 1::2] = np.cos(tab[:, 1::2]) return tab class AttentionMatrix(nn.Module): """Generates attention matrix using the key and query tensors. Parameters ---------- proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int Dimensionality of the Q and K tensors before linear projections. scale : bool If True, then the attention matrix will be divided by `proj_dim ** (1 / 2)` elementwise. Attributes ---------- proj_q, proj_k : torch.nn.Linear Linear models projecting the Q and K tensors. scalar : float Number used for scaling the attention matrix elementwise. """ def __init__(self, hidden_size, proj_dim, scale=True): super().__init__() self.proj_q = nn.Linear( in_features=hidden_size, out_features=proj_dim, bias=False ) self.proj_k = nn.Linear( in_features=hidden_size, out_features=proj_dim, bias=False ) if scale: self.scalar = np.sqrt(proj_dim) else: self.scalar = 1 def forward(self, data_q, data_k): """Run the forward pass. Parameters ---------- data_q : torch.Tensor Query tensor of shape `(n_embeddings, hidden_size)`. data_k : torch.Tensor Key tensor of shape `(n_features, hidden_size)`. Returns ------- attention_weights : torch.Tensor Attention weights (don't sum up to 1 in general) of shape `(n_embeddings, n_features)`. """ q = self.proj_q(data_q) # (n_embeddings, proj_dim) k = self.proj_k(data_k) # (n_features, proj_dim) dot = q @ k.T # (n_embeddings, n_features) dot_scaled = torch.div(dot, self.scalar) # (n_embeddings, n_features) attention_weights = torch.tanh( dot_scaled ) # (n_embeddings, n_features) return attention_weights class AttentionNeuron(nn.Module): """Permutation invariant layer. Parameters ---------- n_embeddings : int Number of rows in the Q tensor. In our case it is equal to the length of the latent code `m`. proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int The dimensionality of the Q and K tensors before linear projections. Attributes ---------- hx : tuple or None If not None then a tuple of 2 hidden state tensors (LSTM specific) lstm : nn.LSTMCell LSTM cell that inputs a hidden state and an observation and outputs a new hidden state. attention_matrix : AttentionMatrix Attention matrix (only needs Q and K tensors). Q : torch.Tensor Query tensor that is not learnable since it is populated with positional encodings. """ def __init__( self, n_embeddings=16, proj_dim=32, hidden_size=8, ): super().__init__() self.n_embeddings = n_embeddings self.proj_dim = proj_dim self.hidden_size = hidden_size # Modules self.hx = None self.lstm = nn.LSTMCell(input_size=2, hidden_size=hidden_size) self.attention_matrix = AttentionMatrix( hidden_size=hidden_size, proj_dim=proj_dim, scale=False, ) self.register_buffer( "Q", torch.from_numpy( pos_table( n_embeddings, hidden_size, ) ).float(), ) def forward(self, obs, prev_action): """Run forward pass. Parameters ---------- obs : torch.Tensor 1D tensor representing the input observations of shape `(n_features,)`. prev_action : float Number between -1 and 1 based on what the previous action was. Returns ------- latent_code : torch.Tensor 1D tensor representing the latent code of shape `(n_embeddings,)`. attn_weights : torch.Tensor 2D tensor of shape `(n_embeddings, n_features)` representing attention weights. """ n_features = len(obs) prev_action = float(prev_action) obs_and_act = torch.cat( [ obs[:, None], torch.ones(n_features, 1) * prev_action, ], dim=-1, ) # (n_features, 2) if self.hx is None: self.hx = ( torch.zeros(n_features, self.hidden_size), torch.zeros(n_features, self.hidden_size), ) self.hx = self.lstm( obs_and_act, self.hx ) # Tuple[(n_features, hidden_size)] data_q = self.Q # (n_embeddings, hidden_size) data_k = self.hx[0] # (n_features, hidden_size) data_v = obs[:, None] # (n_features, 1) attn_weights = self.attention_matrix( data_q=data_q, data_k=data_k ) # (n_embeddings, n_features) latent_code_ = torch.tanh(attn_weights @ data_v) # (n_embeddings, 1) latent_code = latent_code_.squeeze() # (n_embeddings,) return latent_code, attn_weights class PermutationInvariantNetwork(nn.Module): """Permutation invariant policy network. Parameters ---------- n_embeddings : int Number of rows in the Q tensor. proj_dim : int Size of the space to which we project the K and Q tensors. hidden_size : int Dimensionality of the Q and K matrices before linear projections. Attributes ---------- attention_neuron : AttentionNeuron Permutation invariant layer that generates latent codes. linear : nn.Linear Maps the latent code into a single number. """ def __init__( self, n_embeddings=16, proj_dim=32, hidden_size=8, ): super().__init__() self.attention_neuron = AttentionNeuron( n_embeddings=n_embeddings, proj_dim=proj_dim, hidden_size=hidden_size, ) self.linear = nn.Linear(n_embeddings, 1) for p in self.parameters(): p.requires_grad = False def forward(self, obs, prev_action): """Run forward pass. Parameters ---------- obs : torch.Tensor 1D tensor representing the input observations of shape `(n_features,)`. prev_action : float Number between -1 and 1 based on what the previous action was. Returns ------- y : torch.Tensor Scalar tensor with a value in range (-1, 1) representing the next action. """ latent_code, _ = self.attention_neuron( obs, prev_action ) # (n_embeddings,) y_ = torch.tanh(self.linear(latent_code[None, :])) # (1, 1) y = y_[0] # (1,) return y ================================================ FILE: github_adventures/neuron/trainer.py ================================================ import argparse import json import multiprocessing as mp import pathlib import pickle from functools import partial import cma import numpy as np import tqdm from torch.utils.tensorboard import SummaryWriter from solutions import ( MLPSolution, PermutationInvariantSolution, ) from tasks import Task, N_ORIGINAL_FEATURES def save(folder, n_iter, solver, solution_inst): """Save checkpoint. Parameters ---------- folder : str Output folder. n_iter : int Iteration that corresponds to the checkpoint. solver : cma.CMAEvolutionStrategy Solver instance. solution_inst : Solution Solution instance. """ folder = pathlib.Path(folder) folder.mkdir(parents=True, exist_ok=True) path = folder / f"{n_iter}.pkl" with path.open("wb") as f: obj = (solver, solution_inst) pickle.dump(obj, f) def get_fitness( solution_inst, *, shuffle_on_reset, n_episodes, n_noise_features, env_seed, feature_seed, ): """Get fitness function used by the CMA optimizer/solver. Can be run independently on a single worker. Returns ------- fitness : list List of floats of length `n_episodes` holding the per episode reward. """ task = Task( render=False, shuffle_on_reset=shuffle_on_reset, n_noise_features=n_noise_features, env_seed=env_seed, feature_seed=feature_seed, ) fitness = [task.rollout(solution_inst) for _ in range(n_episodes)] return fitness def main(argv=None): parser = argparse.ArgumentParser( "Training", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "solution", type=str, choices=( "linear", "MLP", "invariant", ), ) parser.add_argument( "log_dir", type=str, help="Logging folder", ) parser.add_argument( "--checkpoint", type=str, help="Pickled solver and solution", ) parser.add_argument( "--env-seed", type=int, ) parser.add_argument( "--eval-frequency", type=int, default=25, ) parser.add_argument( "--feature-seed", type=int, ) parser.add_argument( "-m", "--max-iter", type=int, default=10000, help="Maximum number of iterations", ) parser.add_argument( "-e", "--n-episodes", type=int, default=16, help="Number of rollouts for fitness evaluation", ) parser.add_argument( "-j", "--n-jobs", type=int, default=-1, help="Number of processes", ) parser.add_argument( "-n", "--n-noise-features", type=int, default=0, help="Number of noise features", ) parser.add_argument( "-p", "--population-size", type=int, default=256, help="Number of solutions per generation", ) parser.add_argument( "-s", "--shuffle-on-reset", action="store_true", help="Shuffle features before each rollout", ) args = parser.parse_args(argv) writer = SummaryWriter(args.log_dir) writer.add_text("parameters", json.dumps(vars(args))) # Solution map if args.solution == "linear": solution_inst = MLPSolution( n_features=N_ORIGINAL_FEATURES + args.n_noise_features, hidden_layer_sizes=tuple(), ) elif args.solution == "MLP": solution_inst = MLPSolution( n_features=N_ORIGINAL_FEATURES + args.n_noise_features, hidden_layer_sizes=(16,), ) elif args.solution == "invariant": solution_inst = PermutationInvariantSolution( n_embeddings=16, proj_dim=32, hidden_size=8, ) else: raise ValueError # Prepare solver if args.checkpoint is None: x0 = np.zeros(solution_inst.get_n_params()) solver = cma.CMAEvolutionStrategy( x0=x0, sigma0=0.1, inopts={ "popsize": args.population_size, "seed": 42, "randn": np.random.randn, }, ) else: with open(args.checkpoint, "rb") as f: solver, solution_inst_ = pickle.load(f) assert isinstance(solution_inst, solution_inst_.__class__) solution_inst = solution_inst_ get_fitness_partial = partial( get_fitness, n_episodes=args.n_episodes, shuffle_on_reset=args.shuffle_on_reset, n_noise_features=args.n_noise_features, env_seed=args.env_seed, feature_seed=args.feature_seed, ) if args.n_jobs == -1: n_jobs = mp.cpu_count() else: n_jobs = args.n_jobs with mp.Pool(processes=n_jobs) as pool: for n_iter in tqdm.tqdm(range(args.max_iter)): try: params_set = solver.ask() iterable = [ solution_inst.clone().set_params(p) for p in params_set ] rewards = pool.map(get_fitness_partial, iterable) pos_fitnesses = [np.mean(r) for r in rewards] neg_fitnesses = [-x for x in pos_fitnesses] all_parameters = np.concatenate(params_set) metrics = { "parameter_mean": all_parameters.mean(), "parameter_std": all_parameters.std(), "mean": np.mean(pos_fitnesses), "max (generation)": np.max(pos_fitnesses), "max (overall)": -solver.result.fbest, } for metric_name, metric in metrics.items(): writer.add_scalar(metric_name, metric, global_step=n_iter) if (n_iter % args.eval_frequency == 0) or ( n_iter == (args.max_iter - 1) ): save(args.log_dir, n_iter, solver, solution_inst) solver.tell(params_set, neg_fitnesses) except KeyboardInterrupt: save( args.log_dir, n_iter, solver, solution_inst, ) break if __name__ == "__main__": main() ================================================ FILE: github_adventures/pondernet/experiment_1.sh ================================================ set -x SEED=$RANDOM LAMBDAS=(0.1 0.3 0.5 0.7 0.9) for lambda in ${LAMBDAS[@]} do python train.py \ --batch-size 128 \ --beta 0.01 \ --device cuda \ --eval-frequency 4000 \ --n-iter 100000 \ --n-hidden 128 \ --lambda-p $lambda \ --n-elems 15 \ results/experiment_a/$SEED/lambda_$lambda done ================================================ FILE: github_adventures/pondernet/experiment_2.sh ================================================ set -x SEED=$RANDOM python train.py \ --batch-size 128 \ --beta 0.01 \ --eval-frequency 4000 \ --device cuda \ --lambda-p 0.2 \ --n-elems 30 \ --n-iter 1500000 \ --n-hidden 128 \ --n-nonzero 1 25 \ results/experiment_b/$SEED ================================================ FILE: github_adventures/pondernet/requirements.txt ================================================ matplotlib numpy tensorboard torch tqdm ================================================ FILE: github_adventures/pondernet/train.py ================================================ from argparse import ArgumentParser import json import pathlib import matplotlib.pyplot as plt import torch import torch.nn as nn from torch.utils.data import DataLoader from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from utils import ( ParityDataset, PonderNet, ReconstructionLoss, RegularizationLoss, ) @torch.no_grad() def evaluate(dataloader, module): """Compute relevant metrics. Parameters ---------- dataloader : DataLoader Dataloader that yields batches of `x` and `y`. module : PonderNet Our pondering network. Returns ------- metrics_single : dict Scalar metrics. The keys are names and the values are `torch.Tensor`. These metrics are computed as mean values over the entire dataset. metrics_per_step : dict Per step metrics. The keys are names and the values are `torch.Tensor` of shape `(max_steps,)`. These metrics are computed as mean values over the entire dataset. """ # Imply device and dtype param = next(module.parameters()) device, dtype = param.device, param.dtype metrics_single_ = { "accuracy_halted": [], "halting_step": [], } metrics_per_step_ = { "accuracy": [], "p": [], } for x_batch, y_true_batch in dataloader: x_batch = x_batch.to(device, dtype) # (batch_size, n_elems) y_true_batch = y_true_batch.to(device, dtype) # (batch_size,) y_pred_batch, p, halting_step = module(x_batch) y_halted_batch = y_pred_batch.gather( dim=0, index=halting_step[None, :] - 1, )[ 0 ] # (batch_size,) # Computing single metrics (mean over samples in the batch) accuracy_halted = ( ((y_halted_batch > 0) == y_true_batch).to(torch.float32).mean() ) metrics_single_["accuracy_halted"].append(accuracy_halted) metrics_single_["halting_step"].append( halting_step.to(torch.float).mean() ) # Computing per step metrics (mean over samples in the batch) accuracy = ( ((y_pred_batch > 0) == y_true_batch[None, :]) .to(torch.float32) .mean(dim=1) ) metrics_per_step_["accuracy"].append(accuracy) metrics_per_step_["p"].append(p.mean(dim=1)) metrics_single = { name: torch.stack(values).mean(dim=0).cpu().numpy() for name, values in metrics_single_.items() } metrics_per_step = { name: torch.stack(values).mean(dim=0).cpu().numpy() for name, values in metrics_per_step_.items() } return metrics_single, metrics_per_step def plot_distributions(target, predicted): """Create a barplot. Parameters ---------- target, predicted : np.ndarray Arrays of shape `(max_steps,)` representing the target and predicted probability distributions. Returns ------- matplotlib.Figure """ support = list(range(1, len(target) + 1)) fig, ax = plt.subplots(dpi=140) ax.bar( support, target, color="red", label=f"Target - Geometric({target[0].item():.2f})", ) ax.bar( support, predicted, color="green", width=0.4, label="Predicted", ) ax.set_ylim(0, 0.6) ax.set_xticks(support) ax.legend() ax.grid() return fig def plot_accuracy(accuracy): """Create a barplot representing accuracy over different halting steps. Parameters ---------- accuracy : np.array 1D array representing accuracy if we were to take the output after the corresponding step. Returns ------- matplotlib.Figure """ support = list(range(1, len(accuracy) + 1)) fig, ax = plt.subplots(dpi=140) ax.bar( support, accuracy, label="Accuracy over different steps", ) ax.set_ylim(0, 1) ax.set_xticks(support) ax.legend() ax.grid() return fig def main(argv=None): """CLI for training.""" parser = ArgumentParser() parser.add_argument( "log_folder", type=str, help="Folder where tensorboard logging is saved", ) parser.add_argument( "--batch-size", type=int, default=128, help="Batch size", ) parser.add_argument( "--beta", type=float, default=0.01, help="Regularization loss coefficient", ) parser.add_argument( "-d", "--device", type=str, choices={"cpu", "cuda"}, default="cpu", help="Device to use", ) parser.add_argument( "--eval-frequency", type=int, default=10_000, help="Evaluation is run every `eval_frequency` steps", ) parser.add_argument( "--lambda-p", type=float, default=0.4, help="True probability of success for a geometric distribution", ) parser.add_argument( "--n-iter", type=int, default=1_000_000, help="Number of gradient steps", ) parser.add_argument( "--n-elems", type=int, default=64, help="Number of elements", ) parser.add_argument( "--n-hidden", type=int, default=64, help="Number of hidden elements in the reccurent cell", ) parser.add_argument( "--n-nonzero", type=int, nargs=2, default=(None, None), help="Lower and upper bound on nonzero elements in the training set", ) parser.add_argument( "--max-steps", type=int, default=20, help="Maximum number of pondering steps", ) # Parameters args = parser.parse_args(argv) print(args) device = torch.device(args.device) dtype = torch.float32 n_eval_samples = 1000 batch_size_eval = 50 if args.n_nonzero[0] is None and args.n_nonzero[1] is None: threshold = int(0.3 * args.n_elems) range_nonzero_easy = (1, threshold) range_nonzero_hard = (args.n_elems - threshold, args.n_elems) else: range_nonzero_easy = (1, args.n_nonzero[1]) range_nonzero_hard = (args.n_nonzero[1] + 1, args.n_elems) # Tensorboard log_folder = pathlib.Path(args.log_folder) writer = SummaryWriter(log_folder) writer.add_text("parameters", json.dumps(vars(args))) # Prepare data dataloader_train = DataLoader( ParityDataset( n_samples=args.batch_size * args.n_iter, n_elems=args.n_elems, n_nonzero_min=args.n_nonzero[0], n_nonzero_max=args.n_nonzero[1], ), batch_size=args.batch_size, ) # consider specifying `num_workers` for speedups eval_dataloaders = { "test": DataLoader( ParityDataset( n_samples=n_eval_samples, n_elems=args.n_elems, n_nonzero_min=args.n_nonzero[0], n_nonzero_max=args.n_nonzero[1], ), batch_size=batch_size_eval, ), f"{range_nonzero_easy[0]}_{range_nonzero_easy[1]}": DataLoader( ParityDataset( n_samples=n_eval_samples, n_elems=args.n_elems, n_nonzero_min=range_nonzero_easy[0], n_nonzero_max=range_nonzero_easy[1], ), batch_size=batch_size_eval, ), f"{range_nonzero_hard[0]}_{range_nonzero_hard[1]}": DataLoader( ParityDataset( n_samples=n_eval_samples, n_elems=args.n_elems, n_nonzero_min=range_nonzero_hard[0], n_nonzero_max=range_nonzero_hard[1], ), batch_size=batch_size_eval, ), } # Model preparation module = PonderNet( n_elems=args.n_elems, n_hidden=args.n_hidden, max_steps=args.max_steps, ) module = module.to(device, dtype) # Loss preparation loss_rec_inst = ReconstructionLoss( nn.BCEWithLogitsLoss(reduction="none") ).to(device, dtype) loss_reg_inst = RegularizationLoss( lambda_p=args.lambda_p, max_steps=args.max_steps, ).to(device, dtype) # Optimizer optimizer = torch.optim.Adam( module.parameters(), lr=0.0003, ) # Training and evaluation loops iterator = tqdm(enumerate(dataloader_train), total=args.n_iter) for step, (x_batch, y_true_batch) in iterator: x_batch = x_batch.to(device, dtype) y_true_batch = y_true_batch.to(device, dtype) y_pred_batch, p, halting_step = module(x_batch) loss_rec = loss_rec_inst( p, y_pred_batch, y_true_batch, ) loss_reg = loss_reg_inst( p, ) loss_overall = loss_rec + args.beta * loss_reg optimizer.zero_grad() loss_overall.backward() torch.nn.utils.clip_grad_norm_(module.parameters(), 1) optimizer.step() # Logging writer.add_scalar("loss_rec", loss_rec, step) writer.add_scalar("loss_reg", loss_reg, step) writer.add_scalar("loss_overall", loss_overall, step) # Evaluation if step % args.eval_frequency == 0: module.eval() for dataloader_name, dataloader in eval_dataloaders.items(): metrics_single, metrics_per_step = evaluate( dataloader, module, ) fig_dist = plot_distributions( loss_reg_inst.p_g.cpu().numpy(), metrics_per_step["p"], ) writer.add_figure( f"distributions/{dataloader_name}", fig_dist, step ) fig_acc = plot_accuracy(metrics_per_step["accuracy"]) writer.add_figure( f"accuracy_per_step/{dataloader_name}", fig_acc, step ) for metric_name, metric_value in metrics_single.items(): writer.add_scalar( f"{metric_name}/{dataloader_name}", metric_value, step, ) torch.save(module, log_folder / "checkpoint.pth") module.train() if __name__ == "__main__": main() ================================================ FILE: github_adventures/pondernet/utils.py ================================================ import torch import torch.nn as nn from torch.utils.data import Dataset class ParityDataset(Dataset): """Parity of vectors - binary classification dataset. Parameters ---------- n_samples : int Number of samples to generate. n_elems : int Size of the vectors. n_nonzero_min, n_nonzero_max : int or None Minimum (inclusive) and maximum (inclusive) number of nonzero elements in the feature vector. If not specified then `(1, n_elem)`. """ def __init__( self, n_samples, n_elems, n_nonzero_min=None, n_nonzero_max=None, ): self.n_samples = n_samples self.n_elems = n_elems self.n_nonzero_min = 1 if n_nonzero_min is None else n_nonzero_min self.n_nonzero_max = ( n_elems if n_nonzero_max is None else n_nonzero_max ) assert 0 <= self.n_nonzero_min <= self.n_nonzero_max <= n_elems def __len__(self): """Get the number of samples.""" return self.n_samples def __getitem__(self, idx): """Get a feature vector and it's parity (target). Note that the generating process is random. """ x = torch.zeros((self.n_elems,)) n_non_zero = torch.randint( self.n_nonzero_min, self.n_nonzero_max + 1, (1,) ).item() x[:n_non_zero] = torch.randint(0, 2, (n_non_zero,)) * 2 - 1 x = x[torch.randperm(self.n_elems)] y = (x == 1.0).sum() % 2 return x, y class PonderNet(nn.Module): """Network that ponders. Parameters ---------- n_elems : int Number of features in the vector. n_hidden : int Hidden layer size of the recurrent cell. max_steps : int Maximum number of steps the network can "ponder" for. allow_halting : bool If True, then the forward pass is allowed to halt before reaching the maximum steps. Attributes ---------- cell : nn.GRUCell Learnable GRU cell that maps the previous hidden state and the input to a new hidden state. output_layer : nn.Linear Linear module that serves as the binary classifier. It inputs the hidden state. lambda_layer : nn.Linear Linear module that generates the halting probability at each step. """ def __init__( self, n_elems, n_hidden=64, max_steps=20, allow_halting=False ): super().__init__() self.max_steps = max_steps self.n_hidden = n_hidden self.allow_halting = allow_halting self.cell = nn.GRUCell(n_elems, n_hidden) self.output_layer = nn.Linear(n_hidden, 1) self.lambda_layer = nn.Linear(n_hidden, 1) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Batch of input features of shape `(batch_size, n_elems)`. Returns ------- y : torch.Tensor Tensor of shape `(max_steps, batch_size)` representing the predictions for each step and each sample. In case `allow_halting=True` then the shape is `(steps, batch_size)` where `1 <= steps <= max_steps`. p : torch.Tensor Tensor of shape `(max_steps, batch_size)` representing the halting probabilities. Sums over rows (fixing a sample) are 1. In case `allow_halting=True` then the shape is `(steps, batch_size)` where `1 <= steps <= max_steps`. halting_step : torch.Tensor An integer for each sample in the batch that corresponds to the step when it was halted. The shape is `(batch_size,)`. The minimal value is 1 because we always run at least one step. """ batch_size, _ = x.shape device = x.device h = x.new_zeros(batch_size, self.n_hidden) un_halted_prob = x.new_ones(batch_size) y_list = [] p_list = [] halting_step = torch.zeros( batch_size, dtype=torch.long, device=device, ) for n in range(1, self.max_steps + 1): if n == self.max_steps: lambda_n = x.new_ones(batch_size) # (batch_size,) else: lambda_n = torch.sigmoid(self.lambda_layer(h))[ :, 0 ] # (batch_size,) # Store releavant outputs y_list.append(self.output_layer(h)[:, 0]) # (batch_size,) p_list.append(un_halted_prob * lambda_n) # (batch_size,) halting_step = torch.maximum( n * (halting_step == 0) * torch.bernoulli(lambda_n).to(torch.long), halting_step, ) # Prepare for next iteration un_halted_prob = un_halted_prob * (1 - lambda_n) h = self.cell(x, h) # Potentially stop if all samples halted if self.allow_halting and (halting_step > 0).sum() == batch_size: break y = torch.stack(y_list) p = torch.stack(p_list) return y, p, halting_step class ReconstructionLoss(nn.Module): """Weighted average of per step losses. Parameters ---------- loss_func : callable Loss function that accepts `y_pred` and `y_true` as arguments. Both of these tensors have shape `(batch_size,)`. It outputs a loss for each sample in the batch. """ def __init__(self, loss_func): super().__init__() self.loss_func = loss_func def forward(self, p, y_pred, y_true): """Compute loss. Parameters ---------- p : torch.Tensor Probability of halting of shape `(max_steps, batch_size)`. y_pred : torch.Tensor Predicted outputs of shape `(max_steps, batch_size)`. y_true : torch.Tensor True targets of shape `(batch_size,)`. Returns ------- loss : torch.Tensor Scalar representing the reconstruction loss. It is nothing else than a weighted sum of per step losses. """ max_steps, _ = p.shape total_loss = p.new_tensor(0.0) for n in range(max_steps): loss_per_sample = p[n] * self.loss_func( y_pred[n], y_true ) # (batch_size,) total_loss = total_loss + loss_per_sample.mean() # (1,) return total_loss class RegularizationLoss(nn.Module): """Enforce halting distribution to ressemble the geometric distribution. Parameters ---------- lambda_p : float The single parameter determining uniquely the geometric distribution. Note that the expected value of this distribution is going to be `1 / lambda_p`. max_steps : int Maximum number of pondering steps. """ def __init__(self, lambda_p, max_steps=20): super().__init__() p_g = torch.zeros((max_steps,)) not_halted = 1.0 for k in range(max_steps): p_g[k] = not_halted * lambda_p not_halted = not_halted * (1 - lambda_p) self.register_buffer("p_g", p_g) self.kl_div = nn.KLDivLoss(reduction="batchmean") def forward(self, p): """Compute loss. Parameters ---------- p : torch.Tensor Probability of halting of shape `(steps, batch_size)`. Returns ------- loss : torch.Tensor Scalar representing the regularization loss. """ steps, batch_size = p.shape p = p.transpose(0, 1) # (batch_size, max_steps) p_g_batch = self.p_g[None, :steps].expand_as( p ) # (batch_size, max_steps) return self.kl_div(p.log(), p_g_batch) ================================================ FILE: github_adventures/product_quantization/README.md ================================================ # Installation Run the following to get all the dependencies. ``` pip install -r requirements.txt ``` # Faiss 101 The code for the short intro to FAISS can be found in `faiss_101_ipython.py`. Note that you can use `parse.py` to turn the raw fasttext embeddings into a numpy array. See `run_all.sh` for example usage. # Custom PQ implementation The custom PQ implementation can be found inside of `custom.py`. # End to end script The script `run_all.sh` does the following things: * Download fasttext embeddings * Train multiple indexes (faiss + custom) using the embeddings * Serve gradio apps for similarity search comparing different indexes ``` chmod +x run_all.sh ./run_all ``` Don't forget to kill the Gradio processes by `pkill -f gradio` once you don't need them anymore. ================================================ FILE: github_adventures/product_quantization/convert.py ================================================ import argparse import logging import pathlib import pickle import faiss from custom import CustomIndexPQ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) def from_faiss(faiss_index: faiss.swigfaiss.IndexPQ) -> CustomIndexPQ: if not faiss_index.is_trained: raise ValueError("The faiss index is not trained") if faiss_index.ntotal == 0: raise ValueError("The faiss index has no codes") d = faiss_index.d m = faiss_index.code_size nbits = faiss_index.pq.nbits k = 2**nbits ntotal = faiss_index.ntotal custom_index = CustomIndexPQ(d=d, m=m, nbits=nbits) centers = faiss.vector_to_array(faiss_index.pq.centroids).reshape( m, k, d // m ) logger.info("Copying centers from the faiss index") for i in range(m): custom_index.estimators[i].cluster_centers_ = centers[i] custom_index.is_trained = True logger.info("Copying codes form the faiss index") custom_index.codes = faiss.vector_to_array(faiss_index.codes).reshape( ntotal, m ) return custom_index def main() -> int: parser = argparse.ArgumentParser("Convert from faiss to custom") parser.add_argument( "faiss_index_path", type=pathlib.Path, help="Path to a faiss index", ) parser.add_argument( "output_index_path", type=pathlib.Path, help="Path to a new custom index with faiss parameters", ) args = parser.parse_args() faiss_index = faiss.read_index(str(args.faiss_index_path)) custom_index = from_faiss(faiss_index) with args.output_index_path.open("wb") as f: pickle.dump(custom_index, f) if __name__ == "__main__": main() ================================================ FILE: github_adventures/product_quantization/custom.py ================================================ from __future__ import annotations import logging import numpy as np from sklearn.cluster import KMeans from sklearn.metrics.pairwise import euclidean_distances logger = logging.getLogger(__name__) BITS2DTYPE = { 8: np.uint8, } class CustomIndexPQ: """Custom IndexPQ implementation. Parameters ---------- d Dimensionality of the original vectors. m Number of segments. nbits Number of bits. estimator_kwargs Additional hyperparameters passed onto the sklearn KMeans class. """ def __init__( self, d: int, m: int, nbits: int, **estimator_kwargs: str | int, ) -> None: if d % m != 0: raise ValueError("d needs to be a multiple of m") if nbits not in BITS2DTYPE: raise ValueError(f"Unsupported number of bits {nbits}") self.m = m self.k = 2**nbits self.d = d self.ds = d // m self.estimators = [ KMeans(n_clusters=self.k, **estimator_kwargs) for _ in range(m) ] logger.info(f"Creating following estimators: {self.estimators[0]!r}") self.is_trained = False self.dtype = BITS2DTYPE[nbits] self.dtype_orig = np.float32 self.codes: np.ndarray | None = None def train(self, X: np.ndarray) -> None: """Train all KMeans estimators. Parameters ---------- X Array of shape `(n, d)` and dtype `float32`. """ if self.is_trained: raise ValueError("Training multiple times is not allowed") for i in range(self.m): estimator = self.estimators[i] X_i = X[:, i * self.ds : (i + 1) * self.ds] logger.info(f"Fitting KMeans for the {i}-th segment") estimator.fit(X_i) self.is_trained = True def encode(self, X: np.ndarray) -> np.ndarray: """Encode original features into codes. Parameters ---------- X Array of shape `(n_queries, d)` of dtype `np.float32`. Returns ------- result Array of shape `(n_queries, m)` of dtype `np.uint8`. """ n = len(X) result = np.empty((n, self.m), dtype=self.dtype) for i in range(self.m): estimator = self.estimators[i] X_i = X[:, i * self.ds : (i + 1) * self.ds] result[:, i] = estimator.predict(X_i) return result def add(self, X: np.ndarray) -> None: """Add vectors to the database (their encoded versions). Parameters ---------- X Array of shape `(n_codes, d)` of dtype `np.float32`. """ if not self.is_trained: raise ValueError("The quantizer needs to be trained first.") self.codes = self.encode(X) def compute_asymmetric_distances(self, X: np.ndarray) -> np.ndarray: """Compute asymmetric distances to all database codes. Parameters ---------- X Array of shape `(n_queries, d)` of dtype `np.float32`. Returns ------- distances Array of shape `(n_queries, n_codes)` of dtype `np.float32`. """ if not self.is_trained: raise ValueError("The quantizer needs to be trained first.") if self.codes is None: raise ValueError("No codes detected. You need to run `add` first") n_queries = len(X) n_codes = len(self.codes) distance_table = np.empty( (n_queries, self.m, self.k), dtype=self.dtype_orig ) # (n_queries, m, k) for i in range(self.m): X_i = X[:, i * self.ds : (i + 1) * self.ds] # (n_queries, ds) centers = self.estimators[i].cluster_centers_ # (k, ds) distance_table[:, i, :] = euclidean_distances( X_i, centers, squared=True ) distances = np.zeros((n_queries, n_codes), dtype=self.dtype_orig) for i in range(self.m): distances += distance_table[:, i, self.codes[:, i]] return distances def search(self, X: np.ndarray, k: int) -> tuple[np.ndarray, np.ndarray]: """Find k closest database codes to given queries. Parameters ---------- X Array of shape `(n_queries, d)` of dtype `np.float32`. k The number of closest codes to look for. Returns ------- distances Array of shape `(n_queries, k)`. indices Array of shape `(n_queries, k)`. """ n_queries = len(X) distances_all = self.compute_asymmetric_distances(X) indices = np.argsort(distances_all, axis=1)[:, :k] distances = np.empty((n_queries, k), dtype=np.float32) for i in range(n_queries): distances[i] = distances_all[i][indices[i]] return distances, indices ================================================ FILE: github_adventures/product_quantization/faiss_101_ipython.py ================================================ import numpy as np import faiss # Load fast text embeddings embs = np.load("parsed_fasttext/embs.npy") # change path if necessary embs.shape embs.nbytes / 1e6 # Prepare parameters d = embs.shape[1] m = 10 nbits = 8 k = 2 ** nbits k # Construct index index = faiss.IndexPQ(d, m, nbits) index.is_trained # Try encoding without any training index.sa_encode(embs[:2]) # Train the model index.train(embs) index.is_trained index.ntotal # Add vectors to the database index.add(embs) index.ntotal codes = faiss.vector_to_array(index.codes).reshape(index.ntotal, m) codes[:3] codes.nbytes / 1e6 # Try searching - EXHAUSTIVE SEARCH index.search(embs[:3], 4) # Quickly show that with flat index distances are precise flat_index = faiss.IndexFlatL2(d) flat_index.train(embs) flat_index.add(embs) flat_index.search(embs[:3], 4) ================================================ FILE: github_adventures/product_quantization/generate_index.py ================================================ from __future__ import annotations import argparse import logging import pathlib import pickle import faiss import numpy as np from custom import CustomIndexPQ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "input_path", type=pathlib.Path, help="Path to the full embeddings array", ) parser.add_argument( "index_type", type=str, choices=["faiss-flat", "faiss-pq", "our-pq"], help="Type of index to generate", ) parser.add_argument( "output_path", type=pathlib.Path, help="Path to where to store the index" ) args, unknown_kwargs = parser.parse_known_args() hyperparams: dict[str, int] = {} for i in range(0, len(unknown_kwargs), 2): key_raw, value_raw = unknown_kwargs[i], unknown_kwargs[i + 1] key = key_raw.strip("--") value = int(value_raw) if value_raw.isnumeric() else value_raw hyperparams[key] = value logger.info(f"The following hyperparameters were detected {hyperparams}") logger.info("Loading embeddings") embs = np.load(args.input_path) n, d = embs.shape if args.index_type == "faiss-flat": logger.info("Instantiating IndexFlatL2") index = faiss.IndexFlatL2(d) elif args.index_type == "faiss-pq": logger.info("Instantiating IndexPQ") arguments = [d, hyperparams["m"], hyperparams["nbits"]] index = faiss.IndexPQ(*arguments) elif args.index_type == "our-pq": logger.info("Instantiating CustomIndexPQ") index = CustomIndexPQ(d, **hyperparams) logger.info("Training the index") index.train(embs) logger.info("Adding all embeddings to the index") index.add(embs) logger.info(f"Writing index to disk - {args.output_path}") if args.index_type == "our-pq": with args.output_path.open("wb") as f: pickle.dump(index, f) else: faiss.write_index(index, str(args.output_path)) ================================================ FILE: github_adventures/product_quantization/parse.py ================================================ from __future__ import annotations import argparse import io import logging import pathlib import tqdm import numpy as np logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) def get_embeddings(path: str, maximum: int | None = None) -> tuple[list[str], np.ndarray]: fin = io.open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') n, d = map(int, fin.readline().split()) n = n if maximum is None else min(n, maximum) embs: np.ndarray = np.empty((n, d), dtype=np.float32) words: list[str] = [] for i, line in tqdm.tqdm(enumerate(fin)): if maximum is not None and i == maximum: break tokens = line.rstrip().split(' ') words.append(tokens[0]) embs[i] = list(map(float, tokens[1:])) return words, embs parser = argparse.ArgumentParser() parser.add_argument( "fasttext_path", type=pathlib.Path, help="Path to fasttext embeddings.", ) parser.add_argument( "output_dir", type=pathlib.Path, help="Directory where we store the words and the embeddings." ) parser.add_argument( "-m", "--max", type=int, help="Maximum number of embeddings to parse." ) args = parser.parse_args() path_embs = args.output_dir / "embs.npy" path_words = args.output_dir / "words.txt" args.output_dir.mkdir(exist_ok=True, parents=True) logger.info("Parsing") words, embs = get_embeddings(args.fasttext_path, maximum=args.max) logger.info("Saving words") with path_words.open("w") as f: for word in words: f.write(word + "\n") logger.info("Saving embeddings") np.save(path_embs, embs) ================================================ FILE: github_adventures/product_quantization/requirements.txt ================================================ faiss-cpu==1.7.2 gradio==3.0.17 numpy==1.22.4 pandas==1.4.2 scikit-learn==1.1.1 ================================================ FILE: github_adventures/product_quantization/run_all.sh ================================================ set -ex # Parameters URL=https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz RAW_FASTTEXT=raw_fasttext.vec MAX_WORDS=100000 OUTPUT_FOLDER=new_results # no slash SCIKIT_KWARGS='--n_init 1 --max_iter 30 --init random' # Download fasttext embeddings if [ ! -f $RAW_FASTTEXT ] then curl $URL --output $RAW_FASTTEXT.gz gzip -d $RAW_FASTTEXT.gz fi mkdir $OUTPUT_FOLDER # Parse raw data python parse.py $RAW_FASTTEXT $OUTPUT_FOLDER -m $MAX_WORDS # Generate a couple of different indexes python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ faiss-flat \ $OUTPUT_FOLDER/flat.faiss python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ faiss-pq \ $OUTPUT_FOLDER/faisspq_m4_nbits8.faiss \ --m 4 \ --nbits 8 python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ faiss-pq \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ --m 12 \ --nbits 8 python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ our-pq \ $OUTPUT_FOLDER/custompq_m4_nbits8.pkl \ --m 4 \ --nbits 8 \ $SCIKIT_KWARGS python generate_index.py \ $OUTPUT_FOLDER/embs.npy \ our-pq \ $OUTPUT_FOLDER/custompq_m12_nbits8.pkl \ --m 12 \ --nbits 8 \ $SCIKIT_KWARGS # Convert faiss index into custom index python convert.py \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ $OUTPUT_FOLDER/converted_faisspq_m12_nbits8.pkl # Run webapp GRADIO_SERVER_PORT=7777 python run_gradio.py \ $OUTPUT_FOLDER/flat.faiss \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ $OUTPUT_FOLDER/converted_faisspq_m12_nbits8.pkl \ $OUTPUT_FOLDER/words.txt \ & GRADIO_SERVER_PORT=7778 python run_gradio.py \ $OUTPUT_FOLDER/flat.faiss \ $OUTPUT_FOLDER/faisspq_m4_nbits8.faiss \ $OUTPUT_FOLDER/faisspq_m12_nbits8.faiss \ $OUTPUT_FOLDER/words.txt \ & GRADIO_SERVER_PORT=7779 python run_gradio.py \ $OUTPUT_FOLDER/flat.faiss \ $OUTPUT_FOLDER/custompq_m4_nbits8.pkl \ $OUTPUT_FOLDER/custompq_m12_nbits8.pkl \ $OUTPUT_FOLDER/words.txt \ & # make sure to kill the gradio processes pkill -f gradio ================================================ FILE: github_adventures/product_quantization/run_gradio.py ================================================ from __future__ import annotations import argparse import logging import pathlib import pickle import time from functools import partial from typing import Any import faiss import gradio as gr import numpy as np import pandas as pd logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument( "exact_index_path", type=pathlib.Path, help="Path to the exact index", ) parser.add_argument( "approximate_index_path", type=pathlib.Path, nargs="+", help="Path to the approximate index", ) parser.add_argument( "words_path", type=pathlib.Path, help="Path to the text file containing words", ) args = parser.parse_args() def run( word: str, k: int, exact_index, approximate_indexes: dict[str, Any], words: list[str], word2ix: dict[str, int], ) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, float]]: metrics = {} emb = exact_index.reconstruct(word2ix[word]) start = time.monotonic() D, I = exact_index.search(emb[None, :], k) metrics["time_exact"] = time.monotonic() - start D, I = D[0], I[0] df_e = pd.DataFrame({ "ix": I, "distance": D, "word": [words[i] for i in I], }) dfs_a = [] for name, approximate_index in approximate_indexes.items(): start = time.monotonic() D, I = approximate_index.search(emb[None, :], k) metrics[f"time_approximate_{name}"] = time.monotonic() - start D, I = D[0], I[0] df_a = pd.DataFrame({ "ix": I, "distance": D, "word": [words[i] for i in I], }) dfs_a.append(df_a) metrics[f"recall_{name}"] = len(np.intersect1d(df_e.word.unique(), df_a.word.unique())) / k return df_e, *dfs_a, metrics logger.info(f"Loading words {args.words_path}") words = args.words_path.read_text().strip().split("\n") word2ix = {word: i for i, word in enumerate(words)} logger.info(f"Loading exact index {args.exact_index_path}") exact_index = faiss.read_index(str(args.exact_index_path)) logger.info(f"Loading approximate indexes {args.approximate_index_path}") approximate_indexes = { } for path in args.approximate_index_path: if path.suffix in {".pkl", "pickle"}: with path.open("rb") as f: approximate_indexes[path.stem] = pickle.load(f) else: approximate_indexes[path.stem] = faiss.read_index(str(path)) # Sanity checks assert isinstance(exact_index, faiss.IndexFlat) # assert len(words) == exact_index.ntotal == approximate_index.ntotal run_partial = partial( run, exact_index=exact_index, approximate_indexes=approximate_indexes, words=words, word2ix=word2ix, ) setattr(run_partial, "__name__", "run_function") demo = gr.Interface( fn=run_partial, inputs=[ gr.Textbox(lines=1, placeholder="Word here..."), gr.Slider(minimum=1, maximum=20, value=5, step=1), ], outputs=[ gr.DataFrame(label="exact"), *[gr.DataFrame(label=name) for name in approximate_indexes.keys()], gr.JSON(label="metrics"), ], allow_flagging="never", ) demo.launch() ================================================ FILE: github_adventures/siren/activations.py ================================================ import pathlib from functools import partial import torch from torch.utils.tensorboard import SummaryWriter from core import ImageSiren torch.manual_seed(2) init_functions = { "ones": torch.nn.init.ones_, "eye": torch.nn.init.eye_, "default": partial(torch.nn.init.kaiming_uniform_, a=5 ** (1 / 2)), "paper": None, } for fname, func in init_functions.items(): path = pathlib.Path.cwd() / "tensorboard_logs" / fname writer = SummaryWriter(path) def fh(inst, inp, out, number=0): layer_name = f"{number}_{inst.__class__.__name__}" writer.add_histogram(layer_name, out) model = ImageSiren( hidden_layers=10, hidden_features=200, first_omega=30, hidden_omega=30, custom_init_function_=func, ) for i, layer in enumerate(model.net.modules()): if not i: continue layer.register_forward_hook(partial(fh, number=(i + 1) // 2)) inp = 2 * (torch.rand(10000, 2) - 0.5) writer.add_histogram("0", inp) res = model(inp) ================================================ FILE: github_adventures/siren/core.py ================================================ import numpy as np import torch import torch.nn as nn from scipy.ndimage import laplace, sobel from torch.utils.data import Dataset def paper_init_(weight, is_first=False, omega=1): """Initialize the weigth of the Linear layer. Parameters ---------- weight : torch.Tensor The learnable 2D weight matrix. is_first : bool If True, this Linear layer is the very first one in the network. omega : float Hyperparamter. """ in_features = weight.shape[1] with torch.no_grad(): if is_first: bound = 1 / in_features else: bound = np.sqrt(6 / in_features) / omega weight.uniform_(-bound, bound) class SineLayer(nn.Module): """Linear layer followed by the sine activation. Parameters ---------- in_features : int Number of input features. out_features : int Number of output features. bias : bool If True, the bias is included. is_first : bool If True, then it represents the first layer of the network. Note that it influences the initialization scheme. omega : int Hyperparameter. Determines scaling. custom_init_function_ : None or callable If None, then we are going to use the `paper_init_` defined above. Otherwise, any callable that modifies the `weight` parameter in place. Attributes ---------- linear : nn.Linear Linear layer. """ def __init__( self, in_features, out_features, bias=True, is_first=False, omega=30, custom_init_function_=None, ): super().__init__() self.omega = omega self.linear = nn.Linear(in_features, out_features, bias=bias) if custom_init_function_ is None: paper_init_(self.linear.weight, is_first=is_first, omega=omega) else: custom_init_function_(self.linear.weight) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Tensor of shape `(n_samples, in_features)`. Returns ------- torch.Tensor Tensor of shape `(n_samples, out_features). """ return torch.sin(self.omega * self.linear(x)) class ImageSiren(nn.Module): """Network composed of SineLayers. Parameters ---------- hidden_features : int Number of hidden features (each hidden layer the same). hidden_layers : int Number of hidden layers. first_omega, hidden_omega : float Hyperparameter influencing scaling. custom_init_function_ : None or callable If None, then we are going to use the `paper_init_` defined above. Otherwise any callable that modifies the `weight` parameter in place. Attributes ---------- net : nn.Sequential Sequential collection of `SineLayer` and `nn.Linear` at the end. """ def __init__( self, hidden_features, hidden_layers=1, first_omega=30, hidden_omega=30, custom_init_function_=None, ): super().__init__() in_features = 2 out_features = 1 net = [] net.append( SineLayer( in_features, hidden_features, is_first=True, custom_init_function_=custom_init_function_, omega=first_omega, ) ) for _ in range(hidden_layers): net.append( SineLayer( hidden_features, hidden_features, is_first=False, custom_init_function_=custom_init_function_, omega=hidden_omega, ) ) final_linear = nn.Linear(hidden_features, out_features) if custom_init_function_ is None: paper_init_(final_linear.weight, is_first=False, omega=hidden_omega) else: custom_init_function_(final_linear.weight) net.append(final_linear) self.net = nn.Sequential(*net) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Tensor of shape `(n_samples, 2)` representing the 2D pixel coordinates. Returns ------- torch.Tensor Tensor of shape `(n_samples, 1)` representing the predicted intensities. """ return self.net(x) def generate_coordinates(n): """Generate regular grid of 2D coordinates on [0, n] x [0, n]. Parameters ---------- n : int Number of points per dimension. Returns ------- coords_abs : np.ndarray Array of row and column coordinates of shape `(n ** 2, 2)`. """ rows, cols = np.meshgrid(range(n), range(n), indexing="ij") coords_abs = np.stack([rows.ravel(), cols.ravel()], axis=-1) return coords_abs class PixelDataset(Dataset): """Dataset yielding coordinates, intensitives and (higher) derivatives. Parameters ---------- img : np.ndarray 2D image representing a grayscale image. Attributes ---------- size : int Height and width of the square image. coords_abs : np.ndarray Array of shape `(size ** 2, 2)` representing all coordinates of the `img`. grad : np.ndarray Array of shape `(size, size, 2)` representing the approximate gradient in the two directions. grad_norm : np.ndarray Array of shape `(size, size)` representing the approximate gradient norm of `img`. laplace : np.ndarray Array of shape `(size, size)` representing the approximate laplace operator. """ def __init__(self, img): if not (img.ndim == 2 and img.shape[0] == img.shape[1]): raise ValueError("Only 2D square images are supported.") self.img = img self.size = img.shape[0] self.coords_abs = generate_coordinates(self.size) self.grad = np.stack([sobel(img, axis=0), sobel(img, axis=1)], axis=-1) self.grad_norm = np.linalg.norm(self.grad, axis=-1) self.laplace = laplace(img) def __len__(self): """Determine the number of samples (pixels).""" return self.size ** 2 def __getitem__(self, idx): """Get all relevant data for a single coordinate.""" coords_abs = self.coords_abs[idx] r, c = coords_abs coords = 2 * ((coords_abs / self.size) - 0.5) return { "coords": coords, "coords_abs": coords_abs, "intensity": self.img[r, c], "grad_norm": self.grad_norm[r, c], "grad": self.grad[r, c], "laplace": self.laplace[r, c], } class GradientUtils: @staticmethod def gradient(target, coords): """Compute the gradient with respect to input. Parameters ---------- target : torch.Tensor 2D tensor of shape `(n_coords, ?)` representing the targets. coords : torch.Tensor 2D tensor fo shape `(n_coords, 2)` representing the coordinates. Returns ------- grad : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the gradient. """ return torch.autograd.grad( target, coords, grad_outputs=torch.ones_like(target), create_graph=True )[0] @staticmethod def divergence(grad, coords): """Compute divergence. Parameters ---------- grad : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the gradient wrt x and y. coords : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the coordinates. Returns ------- div : torch.Tensor 2D tensor of shape `(n_coords, 1)` representing the divergence. Notes ----- In a 2D case this will give us f_{xx} + f_{yy}. """ div = 0.0 for i in range(coords.shape[1]): div += torch.autograd.grad( grad[..., i], coords, torch.ones_like(grad[..., i]), create_graph=True, )[0][..., i : i + 1] return div @staticmethod def laplace(target, coords): """Compute laplace operator. Parameters ---------- target : torch.Tensor 2D tesnor of shape `(n_coords, 1)` representing the targets. coords : torch.Tensor 2D tensor of shape `(n_coords, 2)` representing the coordinates. Returns ------- torch.Tensor 2D tensor of shape `(n_coords, 1)` representing the laplace. """ grad = GradientUtils.gradient(target, coords) return GradientUtils.divergence(grad, coords) ================================================ FILE: github_adventures/siren/train.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch from torch.nn import Linear, ReLU, Sequential from torch.utils.data import DataLoader import tqdm from core import GradientUtils, ImageSiren, PixelDataset # Image loading img_ = plt.imread("dog.png") downsampling_factor = 4 img = 2 * (img_ - 0.5) img = img[::downsampling_factor, ::downsampling_factor] size = img.shape[0] dataset = PixelDataset(img) # Parameters n_epochs = 100 batch_size = int(size ** 2) logging_freq = 20 model_name = "siren" # "siren", "mlp_relu" hidden_features = 256 hidden_layers = 3 target = "intensity" # "intensity", "grad", "laplace" # Model creation if model_name == "siren": model = ImageSiren( hidden_features, hidden_layers=hidden_layers, hidden_omega=30, ) elif model_name == "mlp_relu": layers = [Linear(2, hidden_features), ReLU()] for _ in range(hidden_layers): layers.append(Linear(hidden_features, hidden_features)) layers.append(ReLU()) layers.append(Linear(hidden_features, 1)) model = Sequential(*layers) for module in model.modules(): if not isinstance(module, Linear): continue torch.nn.init.xavier_normal_(module.weight) else: raise ValueError("Unsupported model") dataloader = DataLoader(dataset, batch_size=batch_size) optim = torch.optim.Adam(lr=1e-4, params=model.parameters()) # Training loop for e in range(n_epochs): losses = [] for d_batch in tqdm.tqdm(dataloader): x_batch = d_batch["coords"].to(torch.float32) x_batch.requires_grad = True y_true_batch = d_batch["intensity"].to(torch.float32) y_true_batch = y_true_batch[:, None] y_pred_batch = model(x_batch) if target == "intensity": loss = ((y_true_batch - y_pred_batch) ** 2).mean() elif target == "grad": y_pred_g_batch = GradientUtils.gradient(y_pred_batch, x_batch) y_true_g_batch = d_batch["grad"].to(torch.float32) loss = ((y_true_g_batch - y_pred_g_batch) ** 2).mean() elif target == "laplace": y_pred_l_batch = GradientUtils.laplace(y_pred_batch, x_batch) y_true_l_batch = d_batch["laplace"].to(torch.float32)[:, None] loss = ((y_true_l_batch - y_pred_l_batch) ** 2).mean() else: raise ValueError("Unrecognized target") losses.append(loss.item()) optim.zero_grad() loss.backward() optim.step() print(e, np.mean(losses)) if e % logging_freq == 0: pred_img = np.zeros_like(img) pred_img_grad_norm = np.zeros_like(img) pred_img_laplace = np.zeros_like(img) orig_img = np.zeros_like(img) for d_batch in tqdm.tqdm(dataloader): coords = d_batch["coords"].to(torch.float32) coords.requires_grad = True coords_abs = d_batch["coords_abs"].numpy() pred = model(coords) pred_n = pred.detach().numpy().squeeze() pred_g = ( GradientUtils.gradient(pred, coords) .norm(dim=-1) .detach() .numpy() .squeeze() ) pred_l = GradientUtils.laplace(pred, coords).detach().numpy().squeeze() pred_img[coords_abs[:, 0], coords_abs[:, 1]] = pred_n pred_img_grad_norm[coords_abs[:, 0], coords_abs[:, 1]] = pred_g pred_img_laplace[coords_abs[:, 0], coords_abs[:, 1]] = pred_l fig, axs = plt.subplots(3, 2, constrained_layout=True) axs[0, 0].imshow(dataset.img, cmap="gray") axs[0, 1].imshow(pred_img, cmap="gray") axs[1, 0].imshow(dataset.grad_norm, cmap="gray") axs[1, 1].imshow(pred_img_grad_norm, cmap="gray") axs[2, 0].imshow(dataset.laplace, cmap="gray") axs[2, 1].imshow(pred_img_laplace, cmap="gray") for row in axs: for ax in row: ax.set_axis_off() fig.suptitle(f"Iteration: {e}") axs[0, 0].set_title("Ground truth") axs[0, 1].set_title("Prediction") plt.savefig(f"visualization/{e}.png") ================================================ FILE: github_adventures/vision_transformer/classes.txt ================================================ tench, Tinca_tinca goldfish, Carassius_auratus great_white_shark, white_shark, man-eater, man-eating_shark, Carcharodon_carcharias tiger_shark, Galeocerdo_cuvieri hammerhead, hammerhead_shark electric_ray, crampfish, numbfish, torpedo stingray cock hen ostrich, Struthio_camelus brambling, Fringilla_montifringilla goldfinch, Carduelis_carduelis house_finch, linnet, Carpodacus_mexicanus junco, snowbird indigo_bunting, indigo_finch, indigo_bird, Passerina_cyanea robin, American_robin, Turdus_migratorius bulbul jay magpie chickadee water_ouzel, dipper kite bald_eagle, American_eagle, Haliaeetus_leucocephalus vulture great_grey_owl, great_gray_owl, Strix_nebulosa European_fire_salamander, Salamandra_salamandra common_newt, Triturus_vulgaris eft spotted_salamander, Ambystoma_maculatum axolotl, mud_puppy, Ambystoma_mexicanum bullfrog, Rana_catesbeiana tree_frog, tree-frog tailed_frog, bell_toad, ribbed_toad, tailed_toad, Ascaphus_trui loggerhead, loggerhead_turtle, Caretta_caretta leatherback_turtle, leatherback, leathery_turtle, Dermochelys_coriacea mud_turtle terrapin box_turtle, box_tortoise banded_gecko common_iguana, iguana, Iguana_iguana American_chameleon, anole, Anolis_carolinensis whiptail, whiptail_lizard agama frilled_lizard, Chlamydosaurus_kingi alligator_lizard Gila_monster, Heloderma_suspectum green_lizard, Lacerta_viridis African_chameleon, Chamaeleo_chamaeleon Komodo_dragon, Komodo_lizard, dragon_lizard, giant_lizard, Varanus_komodoensis African_crocodile, Nile_crocodile, Crocodylus_niloticus American_alligator, Alligator_mississipiensis triceratops thunder_snake, worm_snake, Carphophis_amoenus ringneck_snake, ring-necked_snake, ring_snake hognose_snake, puff_adder, sand_viper green_snake, grass_snake king_snake, kingsnake garter_snake, grass_snake water_snake vine_snake night_snake, Hypsiglena_torquata boa_constrictor, Constrictor_constrictor rock_python, rock_snake, Python_sebae Indian_cobra, Naja_naja green_mamba sea_snake horned_viper, cerastes, sand_viper, horned_asp, Cerastes_cornutus diamondback, diamondback_rattlesnake, Crotalus_adamanteus sidewinder, horned_rattlesnake, Crotalus_cerastes trilobite harvestman, daddy_longlegs, Phalangium_opilio scorpion black_and_gold_garden_spider, Argiope_aurantia barn_spider, Araneus_cavaticus garden_spider, Aranea_diademata black_widow, Latrodectus_mactans tarantula wolf_spider, hunting_spider tick centipede black_grouse ptarmigan ruffed_grouse, partridge, Bonasa_umbellus prairie_chicken, prairie_grouse, prairie_fowl peacock quail partridge African_grey, African_gray, Psittacus_erithacus macaw sulphur-crested_cockatoo, Kakatoe_galerita, Cacatua_galerita lorikeet coucal bee_eater hornbill hummingbird jacamar toucan drake red-breasted_merganser, Mergus_serrator goose black_swan, Cygnus_atratus tusker echidna, spiny_anteater, anteater platypus, duckbill, duckbilled_platypus, duck-billed_platypus, Ornithorhynchus_anatinus wallaby, brush_kangaroo koala, koala_bear, kangaroo_bear, native_bear, Phascolarctos_cinereus wombat jellyfish sea_anemone, anemone brain_coral flatworm, platyhelminth nematode, nematode_worm, roundworm conch snail slug sea_slug, nudibranch chiton, coat-of-mail_shell, sea_cradle, polyplacophore chambered_nautilus, pearly_nautilus, nautilus Dungeness_crab, Cancer_magister rock_crab, Cancer_irroratus fiddler_crab king_crab, Alaska_crab, Alaskan_king_crab, Alaska_king_crab, Paralithodes_camtschatica American_lobster, Northern_lobster, Maine_lobster, Homarus_americanus spiny_lobster, langouste, rock_lobster, crawfish, crayfish, sea_crawfish crayfish, crawfish, crawdad, crawdaddy hermit_crab isopod white_stork, Ciconia_ciconia black_stork, Ciconia_nigra spoonbill flamingo little_blue_heron, Egretta_caerulea American_egret, great_white_heron, Egretta_albus bittern crane limpkin, Aramus_pictus European_gallinule, Porphyrio_porphyrio American_coot, marsh_hen, mud_hen, water_hen, Fulica_americana bustard ruddy_turnstone, Arenaria_interpres red-backed_sandpiper, dunlin, Erolia_alpina redshank, Tringa_totanus dowitcher oystercatcher, oyster_catcher pelican king_penguin, Aptenodytes_patagonica albatross, mollymawk grey_whale, gray_whale, devilfish, Eschrichtius_gibbosus, Eschrichtius_robustus killer_whale, killer, orca, grampus, sea_wolf, Orcinus_orca dugong, Dugong_dugon sea_lion Chihuahua Japanese_spaniel Maltese_dog, Maltese_terrier, Maltese Pekinese, Pekingese, Peke Shih-Tzu Blenheim_spaniel papillon toy_terrier Rhodesian_ridgeback Afghan_hound, Afghan basset, basset_hound beagle bloodhound, sleuthhound bluetick black-and-tan_coonhound Walker_hound, Walker_foxhound English_foxhound redbone borzoi, Russian_wolfhound Irish_wolfhound Italian_greyhound whippet Ibizan_hound, Ibizan_Podenco Norwegian_elkhound, elkhound otterhound, otter_hound Saluki, gazelle_hound Scottish_deerhound, deerhound Weimaraner Staffordshire_bullterrier, Staffordshire_bull_terrier American_Staffordshire_terrier, Staffordshire_terrier, American_pit_bull_terrier, pit_bull_terrier Bedlington_terrier Border_terrier Kerry_blue_terrier Irish_terrier Norfolk_terrier Norwich_terrier Yorkshire_terrier wire-haired_fox_terrier Lakeland_terrier Sealyham_terrier, Sealyham Airedale, Airedale_terrier cairn, cairn_terrier Australian_terrier Dandie_Dinmont, Dandie_Dinmont_terrier Boston_bull, Boston_terrier miniature_schnauzer giant_schnauzer standard_schnauzer Scotch_terrier, Scottish_terrier, Scottie Tibetan_terrier, chrysanthemum_dog silky_terrier, Sydney_silky soft-coated_wheaten_terrier West_Highland_white_terrier Lhasa, Lhasa_apso flat-coated_retriever curly-coated_retriever golden_retriever Labrador_retriever Chesapeake_Bay_retriever German_short-haired_pointer vizsla, Hungarian_pointer English_setter Irish_setter, red_setter Gordon_setter Brittany_spaniel clumber, clumber_spaniel English_springer, English_springer_spaniel Welsh_springer_spaniel cocker_spaniel, English_cocker_spaniel, cocker Sussex_spaniel Irish_water_spaniel kuvasz schipperke groenendael malinois briard kelpie komondor Old_English_sheepdog, bobtail Shetland_sheepdog, Shetland_sheep_dog, Shetland collie Border_collie Bouvier_des_Flandres, Bouviers_des_Flandres Rottweiler German_shepherd, German_shepherd_dog, German_police_dog, alsatian Doberman, Doberman_pinscher miniature_pinscher Greater_Swiss_Mountain_dog Bernese_mountain_dog Appenzeller EntleBucher boxer bull_mastiff Tibetan_mastiff French_bulldog Great_Dane Saint_Bernard, St_Bernard Eskimo_dog, husky malamute, malemute, Alaskan_malamute Siberian_husky dalmatian, coach_dog, carriage_dog affenpinscher, monkey_pinscher, monkey_dog basenji pug, pug-dog Leonberg Newfoundland, Newfoundland_dog Great_Pyrenees Samoyed, Samoyede Pomeranian chow, chow_chow keeshond Brabancon_griffon Pembroke, Pembroke_Welsh_corgi Cardigan, Cardigan_Welsh_corgi toy_poodle miniature_poodle standard_poodle Mexican_hairless timber_wolf, grey_wolf, gray_wolf, Canis_lupus white_wolf, Arctic_wolf, Canis_lupus_tundrarum red_wolf, maned_wolf, Canis_rufus, Canis_niger coyote, prairie_wolf, brush_wolf, Canis_latrans dingo, warrigal, warragal, Canis_dingo dhole, Cuon_alpinus African_hunting_dog, hyena_dog, Cape_hunting_dog, Lycaon_pictus hyena, hyaena red_fox, Vulpes_vulpes kit_fox, Vulpes_macrotis Arctic_fox, white_fox, Alopex_lagopus grey_fox, gray_fox, Urocyon_cinereoargenteus tabby, tabby_cat tiger_cat Persian_cat Siamese_cat, Siamese Egyptian_cat cougar, puma, catamount, mountain_lion, painter, panther, Felis_concolor lynx, catamount leopard, Panthera_pardus snow_leopard, ounce, Panthera_uncia jaguar, panther, Panthera_onca, Felis_onca lion, king_of_beasts, Panthera_leo tiger, Panthera_tigris cheetah, chetah, Acinonyx_jubatus brown_bear, bruin, Ursus_arctos American_black_bear, black_bear, Ursus_americanus, Euarctos_americanus ice_bear, polar_bear, Ursus_Maritimus, Thalarctos_maritimus sloth_bear, Melursus_ursinus, Ursus_ursinus mongoose meerkat, mierkat tiger_beetle ladybug, ladybeetle, lady_beetle, ladybird, ladybird_beetle ground_beetle, carabid_beetle long-horned_beetle, longicorn, longicorn_beetle leaf_beetle, chrysomelid dung_beetle rhinoceros_beetle weevil fly bee ant, emmet, pismire grasshopper, hopper cricket walking_stick, walkingstick, stick_insect cockroach, roach mantis, mantid cicada, cicala leafhopper lacewing, lacewing_fly dragonfly, darning_needle, devil's_darning_needle, sewing_needle, snake_feeder, snake_doctor, mosquito_hawk, skeeter_hawk damselfly admiral ringlet, ringlet_butterfly monarch, monarch_butterfly, milkweed_butterfly, Danaus_plexippus cabbage_butterfly sulphur_butterfly, sulfur_butterfly lycaenid, lycaenid_butterfly starfish, sea_star sea_urchin sea_cucumber, holothurian wood_rabbit, cottontail, cottontail_rabbit hare Angora, Angora_rabbit hamster porcupine, hedgehog fox_squirrel, eastern_fox_squirrel, Sciurus_niger marmot beaver guinea_pig, Cavia_cobaya sorrel zebra hog, pig, grunter, squealer, Sus_scrofa wild_boar, boar, Sus_scrofa warthog hippopotamus, hippo, river_horse, Hippopotamus_amphibius ox water_buffalo, water_ox, Asiatic_buffalo, Bubalus_bubalis bison ram, tup bighorn, bighorn_sheep, cimarron, Rocky_Mountain_bighorn, Rocky_Mountain_sheep, Ovis_canadensis ibex, Capra_ibex hartebeest impala, Aepyceros_melampus gazelle Arabian_camel, dromedary, Camelus_dromedarius llama weasel mink polecat, fitch, foulmart, foumart, Mustela_putorius black-footed_ferret, ferret, Mustela_nigripes otter skunk, polecat, wood_pussy badger armadillo three-toed_sloth, ai, Bradypus_tridactylus orangutan, orang, orangutang, Pongo_pygmaeus gorilla, Gorilla_gorilla chimpanzee, chimp, Pan_troglodytes gibbon, Hylobates_lar siamang, Hylobates_syndactylus, Symphalangus_syndactylus guenon, guenon_monkey patas, hussar_monkey, Erythrocebus_patas baboon macaque langur colobus, colobus_monkey proboscis_monkey, Nasalis_larvatus marmoset capuchin, ringtail, Cebus_capucinus howler_monkey, howler titi, titi_monkey spider_monkey, Ateles_geoffroyi squirrel_monkey, Saimiri_sciureus Madagascar_cat, ring-tailed_lemur, Lemur_catta indri, indris, Indri_indri, Indri_brevicaudatus Indian_elephant, Elephas_maximus African_elephant, Loxodonta_africana lesser_panda, red_panda, panda, bear_cat, cat_bear, Ailurus_fulgens giant_panda, panda, panda_bear, coon_bear, Ailuropoda_melanoleuca barracouta, snoek eel coho, cohoe, coho_salmon, blue_jack, silver_salmon, Oncorhynchus_kisutch rock_beauty, Holocanthus_tricolor anemone_fish sturgeon gar, garfish, garpike, billfish, Lepisosteus_osseus lionfish puffer, pufferfish, blowfish, globefish abacus abaya academic_gown, academic_robe, judge's_robe accordion, piano_accordion, squeeze_box acoustic_guitar aircraft_carrier, carrier, flattop, attack_aircraft_carrier airliner airship, dirigible altar ambulance amphibian, amphibious_vehicle analog_clock apiary, bee_house apron ashcan, trash_can, garbage_can, wastebin, ash_bin, ash-bin, ashbin, dustbin, trash_barrel, trash_bin assault_rifle, assault_gun backpack, back_pack, knapsack, packsack, rucksack, haversack bakery, bakeshop, bakehouse balance_beam, beam balloon ballpoint, ballpoint_pen, ballpen, Biro Band_Aid banjo bannister, banister, balustrade, balusters, handrail barbell barber_chair barbershop barn barometer barrel, cask barrow, garden_cart, lawn_cart, wheelbarrow baseball basketball bassinet bassoon bathing_cap, swimming_cap bath_towel bathtub, bathing_tub, bath, tub beach_wagon, station_wagon, wagon, estate_car, beach_waggon, station_waggon, waggon beacon, lighthouse, beacon_light, pharos beaker bearskin, busby, shako beer_bottle beer_glass bell_cote, bell_cot bib bicycle-built-for-two, tandem_bicycle, tandem bikini, two-piece binder, ring-binder binoculars, field_glasses, opera_glasses birdhouse boathouse bobsled, bobsleigh, bob bolo_tie, bolo, bola_tie, bola bonnet, poke_bonnet bookcase bookshop, bookstore, bookstall bottlecap bow bow_tie, bow-tie, bowtie brass, memorial_tablet, plaque brassiere, bra, bandeau breakwater, groin, groyne, mole, bulwark, seawall, jetty breastplate, aegis, egis broom bucket, pail buckle bulletproof_vest bullet_train, bullet butcher_shop, meat_market cab, hack, taxi, taxicab caldron, cauldron candle, taper, wax_light cannon canoe can_opener, tin_opener cardigan car_mirror carousel, carrousel, merry-go-round, roundabout, whirligig carpenter's_kit, tool_kit carton car_wheel cash_machine, cash_dispenser, automated_teller_machine, automatic_teller_machine, automated_teller, automatic_teller, ATM cassette cassette_player castle catamaran CD_player cello, violoncello cellular_telephone, cellular_phone, cellphone, cell, mobile_phone chain chainlink_fence chain_mail, ring_mail, mail, chain_armor, chain_armour, ring_armor, ring_armour chain_saw, chainsaw chest chiffonier, commode chime, bell, gong china_cabinet, china_closet Christmas_stocking church, church_building cinema, movie_theater, movie_theatre, movie_house, picture_palace cleaver, meat_cleaver, chopper cliff_dwelling cloak clog, geta, patten, sabot cocktail_shaker coffee_mug coffeepot coil, spiral, volute, whorl, helix combination_lock computer_keyboard, keypad confectionery, confectionary, candy_store container_ship, containership, container_vessel convertible corkscrew, bottle_screw cornet, horn, trumpet, trump cowboy_boot cowboy_hat, ten-gallon_hat cradle crane crash_helmet crate crib, cot Crock_Pot croquet_ball crutch cuirass dam, dike, dyke desk desktop_computer dial_telephone, dial_phone diaper, nappy, napkin digital_clock digital_watch dining_table, board dishrag, dishcloth dishwasher, dish_washer, dishwashing_machine disk_brake, disc_brake dock, dockage, docking_facility dogsled, dog_sled, dog_sleigh dome doormat, welcome_mat drilling_platform, offshore_rig drum, membranophone, tympan drumstick dumbbell Dutch_oven electric_fan, blower electric_guitar electric_locomotive entertainment_center envelope espresso_maker face_powder feather_boa, boa file, file_cabinet, filing_cabinet fireboat fire_engine, fire_truck fire_screen, fireguard flagpole, flagstaff flute, transverse_flute folding_chair football_helmet forklift fountain fountain_pen four-poster freight_car French_horn, horn frying_pan, frypan, skillet fur_coat garbage_truck, dustcart gasmask, respirator, gas_helmet gas_pump, gasoline_pump, petrol_pump, island_dispenser goblet go-kart golf_ball golfcart, golf_cart gondola gong, tam-tam gown grand_piano, grand greenhouse, nursery, glasshouse grille, radiator_grille grocery_store, grocery, food_market, market guillotine hair_slide hair_spray half_track hammer hamper hand_blower, blow_dryer, blow_drier, hair_dryer, hair_drier hand-held_computer, hand-held_microcomputer handkerchief, hankie, hanky, hankey hard_disc, hard_disk, fixed_disk harmonica, mouth_organ, harp, mouth_harp harp harvester, reaper hatchet holster home_theater, home_theatre honeycomb hook, claw hoopskirt, crinoline horizontal_bar, high_bar horse_cart, horse-cart hourglass iPod iron, smoothing_iron jack-o'-lantern jean, blue_jean, denim jeep, landrover jersey, T-shirt, tee_shirt jigsaw_puzzle jinrikisha, ricksha, rickshaw joystick kimono knee_pad knot lab_coat, laboratory_coat ladle lampshade, lamp_shade laptop, laptop_computer lawn_mower, mower lens_cap, lens_cover letter_opener, paper_knife, paperknife library lifeboat lighter, light, igniter, ignitor limousine, limo liner, ocean_liner lipstick, lip_rouge Loafer lotion loudspeaker, speaker, speaker_unit, loudspeaker_system, speaker_system loupe, jeweler's_loupe lumbermill, sawmill magnetic_compass mailbag, postbag mailbox, letter_box maillot maillot, tank_suit manhole_cover maraca marimba, xylophone mask matchstick maypole maze, labyrinth measuring_cup medicine_chest, medicine_cabinet megalith, megalithic_structure microphone, mike microwave, microwave_oven military_uniform milk_can minibus miniskirt, mini minivan missile mitten mixing_bowl mobile_home, manufactured_home Model_T modem monastery monitor moped mortar mortarboard mosque mosquito_net motor_scooter, scooter mountain_bike, all-terrain_bike, off-roader mountain_tent mouse, computer_mouse mousetrap moving_van muzzle nail neck_brace necklace nipple notebook, notebook_computer obelisk oboe, hautboy, hautbois ocarina, sweet_potato odometer, hodometer, mileometer, milometer oil_filter organ, pipe_organ oscilloscope, scope, cathode-ray_oscilloscope, CRO overskirt oxcart oxygen_mask packet paddle, boat_paddle paddlewheel, paddle_wheel padlock paintbrush pajama, pyjama, pj's, jammies palace panpipe, pandean_pipe, syrinx paper_towel parachute, chute parallel_bars, bars park_bench parking_meter passenger_car, coach, carriage patio, terrace pay-phone, pay-station pedestal, plinth, footstall pencil_box, pencil_case pencil_sharpener perfume, essence Petri_dish photocopier pick, plectrum, plectron pickelhaube picket_fence, paling pickup, pickup_truck pier piggy_bank, penny_bank pill_bottle pillow ping-pong_ball pinwheel pirate, pirate_ship pitcher, ewer plane, carpenter's_plane, woodworking_plane planetarium plastic_bag plate_rack plow, plough plunger, plumber's_helper Polaroid_camera, Polaroid_Land_camera pole police_van, police_wagon, paddy_wagon, patrol_wagon, wagon, black_Maria poncho pool_table, billiard_table, snooker_table pop_bottle, soda_bottle pot, flowerpot potter's_wheel power_drill prayer_rug, prayer_mat printer prison, prison_house projectile, missile projector puck, hockey_puck punching_bag, punch_bag, punching_ball, punchball purse quill, quill_pen quilt, comforter, comfort, puff racer, race_car, racing_car racket, racquet radiator radio, wireless radio_telescope, radio_reflector rain_barrel recreational_vehicle, RV, R.V. reel reflex_camera refrigerator, icebox remote_control, remote restaurant, eating_house, eating_place, eatery revolver, six-gun, six-shooter rifle rocking_chair, rocker rotisserie rubber_eraser, rubber, pencil_eraser rugby_ball rule, ruler running_shoe safe safety_pin saltshaker, salt_shaker sandal sarong sax, saxophone scabbard scale, weighing_machine school_bus schooner scoreboard screen, CRT_screen screw screwdriver seat_belt, seatbelt sewing_machine shield, buckler shoe_shop, shoe-shop, shoe_store shoji shopping_basket shopping_cart shovel shower_cap shower_curtain ski ski_mask sleeping_bag slide_rule, slipstick sliding_door slot, one-armed_bandit snorkel snowmobile snowplow, snowplough soap_dispenser soccer_ball sock solar_dish, solar_collector, solar_furnace sombrero soup_bowl space_bar space_heater space_shuttle spatula speedboat spider_web, spider's_web spindle sports_car, sport_car spotlight, spot stage steam_locomotive steel_arch_bridge steel_drum stethoscope stole stone_wall stopwatch, stop_watch stove strainer streetcar, tram, tramcar, trolley, trolley_car stretcher studio_couch, day_bed stupa, tope submarine, pigboat, sub, U-boat suit, suit_of_clothes sundial sunglass sunglasses, dark_glasses, shades sunscreen, sunblock, sun_blocker suspension_bridge swab, swob, mop sweatshirt swimming_trunks, bathing_trunks swing switch, electric_switch, electrical_switch syringe table_lamp tank, army_tank, armored_combat_vehicle, armoured_combat_vehicle tape_player teapot teddy, teddy_bear television, television_system tennis_ball thatch, thatched_roof theater_curtain, theatre_curtain thimble thresher, thrasher, threshing_machine throne tile_roof toaster tobacco_shop, tobacconist_shop, tobacconist toilet_seat torch totem_pole tow_truck, tow_car, wrecker toyshop tractor trailer_truck, tractor_trailer, trucking_rig, rig, articulated_lorry, semi tray trench_coat tricycle, trike, velocipede trimaran tripod triumphal_arch trolleybus, trolley_coach, trackless_trolley trombone tub, vat turnstile typewriter_keyboard umbrella unicycle, monocycle upright, upright_piano vacuum, vacuum_cleaner vase vault velvet vending_machine vestment viaduct violin, fiddle volleyball waffle_iron wall_clock wallet, billfold, notecase, pocketbook wardrobe, closet, press warplane, military_plane washbasin, handbasin, washbowl, lavabo, wash-hand_basin washer, automatic_washer, washing_machine water_bottle water_jug water_tower whiskey_jug whistle wig window_screen window_shade Windsor_tie wine_bottle wing wok wooden_spoon wool, woolen, woollen worm_fence, snake_fence, snake-rail_fence, Virginia_fence wreck yawl yurt web_site, website, internet_site, site comic_book crossword_puzzle, crossword street_sign traffic_light, traffic_signal, stoplight book_jacket, dust_cover, dust_jacket, dust_wrapper menu plate guacamole consomme hot_pot, hotpot trifle ice_cream, icecream ice_lolly, lolly, lollipop, popsicle French_loaf bagel, beigel pretzel cheeseburger hotdog, hot_dog, red_hot mashed_potato head_cabbage broccoli cauliflower zucchini, courgette spaghetti_squash acorn_squash butternut_squash cucumber, cuke artichoke, globe_artichoke bell_pepper cardoon mushroom Granny_Smith strawberry orange lemon fig pineapple, ananas banana jackfruit, jak, jack custard_apple pomegranate hay carbonara chocolate_sauce, chocolate_syrup dough meat_loaf, meatloaf pizza, pizza_pie potpie burrito red_wine espresso cup eggnog alp bubble cliff, drop, drop-off coral_reef geyser lakeside, lakeshore promontory, headland, head, foreland sandbar, sand_bar seashore, coast, seacoast, sea-coast valley, vale volcano ballplayer, baseball_player groom, bridegroom scuba_diver rapeseed daisy yellow_lady's_slipper, yellow_lady-slipper, Cypripedium_calceolus, Cypripedium_parviflorum corn acorn hip, rose_hip, rosehip buckeye, horse_chestnut, conker coral_fungus agaric gyromitra stinkhorn, carrion_fungus earthstar hen-of-the-woods, hen_of_the_woods, Polyporus_frondosus, Grifola_frondosa bolete ear, spike, capitulum toilet_tissue, toilet_paper, bathroom_tissue ================================================ FILE: github_adventures/vision_transformer/custom.py ================================================ import torch import torch.nn as nn class PatchEmbed(nn.Module): """Split image into patches and then embed them. Parameters ---------- img_size : int Size of the image (it is a square). patch_size : int Size of the patch (it is a square). in_chans : int Number of input channels. embed_dim : int The emmbedding dimension. Attributes ---------- n_patches : int Number of patches inside of our image. proj : nn.Conv2d Convolutional layer that does both the splitting into patches and their embedding. """ def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768): super().__init__() self.img_size = img_size self.patch_size = patch_size self.n_patches = (img_size // patch_size) ** 2 self.proj = nn.Conv2d( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, ) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, in_chans, img_size, img_size)`. Returns ------- torch.Tensor Shape `(n_samples, n_patches, embed_dim)`. """ x = self.proj( x ) # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5) x = x.flatten(2) # (n_samples, embed_dim, n_patches) x = x.transpose(1, 2) # (n_samples, n_patches, embed_dim) return x class Attention(nn.Module): """Attention mechanism. Parameters ---------- dim : int The input and out dimension of per token features. n_heads : int Number of attention heads. qkv_bias : bool If True then we include bias to the query, key and value projections. attn_p : float Dropout probability applied to the query, key and value tensors. proj_p : float Dropout probability applied to the output tensor. Attributes ---------- scale : float Normalizing consant for the dot product. qkv : nn.Linear Linear projection for the query, key and value. proj : nn.Linear Linear mapping that takes in the concatenated output of all attention heads and maps it into a new space. attn_drop, proj_drop : nn.Dropout Dropout layers. """ def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.): super().__init__() self.n_heads = n_heads self.dim = dim self.head_dim = dim // n_heads self.scale = self.head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_p) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_p) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_patches + 1, dim)`. Returns ------- torch.Tensor Shape `(n_samples, n_patches + 1, dim)`. """ n_samples, n_tokens, dim = x.shape if dim != self.dim: raise ValueError qkv = self.qkv(x) # (n_samples, n_patches + 1, 3 * dim) qkv = qkv.reshape( n_samples, n_tokens, 3, self.n_heads, self.head_dim ) # (n_smaples, n_patches + 1, 3, n_heads, head_dim) qkv = qkv.permute( 2, 0, 3, 1, 4 ) # (3, n_samples, n_heads, n_patches + 1, head_dim) q, k, v = qkv[0], qkv[1], qkv[2] k_t = k.transpose(-2, -1) # (n_samples, n_heads, head_dim, n_patches + 1) dp = ( q @ k_t ) * self.scale # (n_samples, n_heads, n_patches + 1, n_patches + 1) attn = dp.softmax(dim=-1) # (n_samples, n_heads, n_patches + 1, n_patches + 1) attn = self.attn_drop(attn) weighted_avg = attn @ v # (n_samples, n_heads, n_patches +1, head_dim) weighted_avg = weighted_avg.transpose( 1, 2 ) # (n_samples, n_patches + 1, n_heads, head_dim) weighted_avg = weighted_avg.flatten(2) # (n_samples, n_patches + 1, dim) x = self.proj(weighted_avg) # (n_samples, n_patches + 1, dim) x = self.proj_drop(x) # (n_samples, n_patches + 1, dim) return x class MLP(nn.Module): """Multilayer perceptron. Parameters ---------- in_features : int Number of input features. hidden_features : int Number of nodes in the hidden layer. out_features : int Number of output features. p : float Dropout probability. Attributes ---------- fc : nn.Linear The First linear layer. act : nn.GELU GELU activation function. fc2 : nn.Linear The second linear layer. drop : nn.Dropout Dropout layer. """ def __init__(self, in_features, hidden_features, out_features, p=0.): super().__init__() self.fc1 = nn.Linear(in_features, hidden_features) self.act = nn.GELU() self.fc2 = nn.Linear(hidden_features, out_features) self.drop = nn.Dropout(p) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_patches + 1, in_features)`. Returns ------- torch.Tensor Shape `(n_samples, n_patches +1, out_features)` """ x = self.fc1( x ) # (n_samples, n_patches + 1, hidden_features) x = self.act(x) # (n_samples, n_patches + 1, hidden_features) x = self.drop(x) # (n_samples, n_patches + 1, hidden_features) x = self.fc2(x) # (n_samples, n_patches + 1, out_features) x = self.drop(x) # (n_samples, n_patches + 1, out_features) return x class Block(nn.Module): """Transformer block. Parameters ---------- dim : int Embeddinig dimension. n_heads : int Number of attention heads. mlp_ratio : float Determines the hidden dimension size of the `MLP` module with respect to `dim`. qkv_bias : bool If True then we include bias to the query, key and value projections. p, attn_p : float Dropout probability. Attributes ---------- norm1, norm2 : LayerNorm Layer normalization. attn : Attention Attention module. mlp : MLP MLP module. """ def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.): super().__init__() self.norm1 = nn.LayerNorm(dim, eps=1e-6) self.attn = Attention( dim, n_heads=n_heads, qkv_bias=qkv_bias, attn_p=attn_p, proj_p=p ) self.norm2 = nn.LayerNorm(dim, eps=1e-6) hidden_features = int(dim * mlp_ratio) self.mlp = MLP( in_features=dim, hidden_features=hidden_features, out_features=dim, ) def forward(self, x): """Run forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, n_patches + 1, dim)`. Returns ------- torch.Tensor Shape `(n_samples, n_patches + 1, dim)`. """ x = x + self.attn(self.norm1(x)) x = x + self.mlp(self.norm2(x)) return x class VisionTransformer(nn.Module): """Simplified implementation of the Vision transformer. Parameters ---------- img_size : int Both height and the width of the image (it is a square). patch_size : int Both height and the width of the patch (it is a square). in_chans : int Number of input channels. n_classes : int Number of classes. embed_dim : int Dimensionality of the token/patch embeddings. depth : int Number of blocks. n_heads : int Number of attention heads. mlp_ratio : float Determines the hidden dimension of the `MLP` module. qkv_bias : bool If True then we include bias to the query, key and value projections. p, attn_p : float Dropout probability. Attributes ---------- patch_embed : PatchEmbed Instance of `PatchEmbed` layer. cls_token : nn.Parameter Learnable parameter that will represent the first token in the sequence. It has `embed_dim` elements. pos_emb : nn.Parameter Positional embedding of the cls token + all the patches. It has `(n_patches + 1) * embed_dim` elements. pos_drop : nn.Dropout Dropout layer. blocks : nn.ModuleList List of `Block` modules. norm : nn.LayerNorm Layer normalization. """ def __init__( self, img_size=384, patch_size=16, in_chans=3, n_classes=1000, embed_dim=768, depth=12, n_heads=12, mlp_ratio=4., qkv_bias=True, p=0., attn_p=0., ): super().__init__() self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ) self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) self.pos_embed = nn.Parameter( torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim) ) self.pos_drop = nn.Dropout(p=p) self.blocks = nn.ModuleList( [ Block( dim=embed_dim, n_heads=n_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, p=p, attn_p=attn_p, ) for _ in range(depth) ] ) self.norm = nn.LayerNorm(embed_dim, eps=1e-6) self.head = nn.Linear(embed_dim, n_classes) def forward(self, x): """Run the forward pass. Parameters ---------- x : torch.Tensor Shape `(n_samples, in_chans, img_size, img_size)`. Returns ------- logits : torch.Tensor Logits over all the classes - `(n_samples, n_classes)`. """ n_samples = x.shape[0] x = self.patch_embed(x) cls_token = self.cls_token.expand( n_samples, -1, -1 ) # (n_samples, 1, embed_dim) x = torch.cat((cls_token, x), dim=1) # (n_samples, 1 + n_patches, embed_dim) x = x + self.pos_embed # (n_samples, 1 + n_patches, embed_dim) x = self.pos_drop(x) for block in self.blocks: x = block(x) x = self.norm(x) cls_token_final = x[:, 0] # just the CLS token x = self.head(cls_token_final) return x ================================================ FILE: github_adventures/vision_transformer/forward.py ================================================ import numpy as np from PIL import Image import torch k = 10 imagenet_labels = dict(enumerate(open("classes.txt"))) model = torch.load("model.pth") model.eval() img = (np.array(Image.open("cat.png")) / 128) - 1 # in the range -1, 1 inp = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).to(torch.float32) logits = model(inp) probs = torch.nn.functional.softmax(logits, dim=-1) top_probs, top_ixs = probs[0].topk(k) for i, (ix_, prob_) in enumerate(zip(top_ixs, top_probs)): ix = ix_.item() prob = prob_.item() cls = imagenet_labels[ix].strip() print(f"{i}: {cls:<45} --- {prob:.4f}") ================================================ FILE: github_adventures/vision_transformer/verify.py ================================================ import numpy as np import timm import torch from custom import VisionTransformer # Helpers def get_n_params(module): return sum(p.numel() for p in module.parameters() if p.requires_grad) def assert_tensors_equal(t1, t2): a1, a2 = t1.detach().numpy(), t2.detach().numpy() np.testing.assert_allclose(a1, a2) model_name = "vit_base_patch16_384" model_official = timm.create_model(model_name, pretrained=True) model_official.eval() print(type(model_official)) custom_config = { "img_size": 384, "in_chans": 3, "patch_size": 16, "embed_dim": 768, "depth": 12, "n_heads": 12, "qkv_bias": True, "mlp_ratio": 4, } model_custom = VisionTransformer(**custom_config) model_custom.eval() for (n_o, p_o), (n_c, p_c) in zip( model_official.named_parameters(), model_custom.named_parameters() ): assert p_o.numel() == p_c.numel() print(f"{n_o} | {n_c}") p_c.data[:] = p_o.data assert_tensors_equal(p_c.data, p_o.data) inp = torch.rand(1, 3, 384, 384) res_c = model_custom(inp) res_o = model_official(inp) # Asserts assert get_n_params(model_custom) == get_n_params(model_official) assert_tensors_equal(res_c, res_o) # Save custom model torch.save(model_custom, "model.pth") ================================================ FILE: mini_tutorials/bentoml/README.md ================================================ 1. [Resources](#resources) 2. [Installation](#installation) 3. [Instructions](#instructions) 1. [`bentoml`](#bentoml) 1. [`bentoctl`](#bentoctl) 1. [`aws` CLI](#aws-cli) 4. [Sketches](#sketches) # Resources * https://docs.bentoml.com/en/latest/ * https://github.com/bentoml/bentoctl * https://github.com/bentoml/aws-sagemaker-deploy # Installation ```bash pip install -r requirements.txt ``` See below the actual versions at the time of making the video ```txt bentoctl==0.4.0 bentoml==1.1.9 boto3==1.29.0 numpy==1.26.2 pydantic==2.5.1 pydantic_core==2.14.3 scikit-learn==1.3.2 ``` # Instructions ## `bentoml` Creating a model ```bash python create_model.py ``` Listing all existing models ```bash bentoml models list ``` Build a bento ```bash bentoml build ``` List all existing bentos ```bash bentoml list ``` Serve a bento locally ```bash bentoml serve $BENTO ``` Serve a `service.py` (development) ```bash bentoml serve service.py ``` ## `bentoctl` Install SageMaker operator ```bash bentoctl operator install aws-sagemaker ``` Initialize ```bash bentoctl init ``` ATTENTION: All of the below assumes that you have correctly set up AWS secret keys and permissions. Build custom customized SageMaker image and push to ECR ```bash bentoctl build -f deployment_config.yaml -b $BENTO ``` Initialize terraform ```bash terraform init ``` Look at what changes will be applied ```bash terraform plan -var-file=bentoctl.tfvars ``` Actually apply changes ```bash terraform apply -var-file=bentoctl.tfvars ``` Send request to the API Gateway ```bash curl -X 'POST' "$URL/classify" -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ "sepal_width": 0, "sepal_length": 0, "petal_width": 0, "petal_length": 0 }' ``` Destroy resources (not including ECR) ```bash terraform destroy -var-file=bentoctl.tfvars ``` Destroy resources including ECR) ```bash bentoctl destroy ``` ## `aws` CLI Describe repositories ```bash aws ecr describe-repositories ``` List all images in the repository `amazing-iris` ```bash aws ecr list-images --repository-name=amazing-iris ``` List SageMaker models ```bash aws sagemaker list-models ``` List SageMaker endpoints ```bash aws sagemaker list-endpoints ``` # Sketches bentoml-overview sklearn-sagemaker ================================================ FILE: mini_tutorials/bentoml/bentofile.yaml ================================================ service: "service:svc" include: - "service.py" python: packages: - pydantic - scikit-learn models: - iris_clf:latest ================================================ FILE: mini_tutorials/bentoml/create_model.py ================================================ import bentoml from sklearn import datasets from sklearn import svm iris = datasets.load_iris() X, y = iris.data, iris.target clf = svm.SVC(gamma="scale") clf.fit(X, y) saved_model = bentoml.sklearn.save_model("iris_clf", clf) print(saved_model) ================================================ FILE: mini_tutorials/bentoml/requirements.txt ================================================ bentoctl bentoml boto3 numpy pydantic scikit-learn ================================================ FILE: mini_tutorials/bentoml/service.py ================================================ from typing import Literal import bentoml from pydantic import BaseModel from bentoml.io import JSON iris_clf_runner = bentoml.sklearn.get("iris_clf:latest").to_runner() svc = bentoml.Service("iris_classifier", runners=[iris_clf_runner]) class Request(BaseModel): sepal_width: float sepal_length: float petal_width: float petal_length: float class Response(BaseModel): label: Literal["setosa", "versicolor", "virginica"] @svc.api(input=JSON(pydantic_model=Request), output=JSON(pydantic_model=Response)) def classify(request: Request) -> Response: input_ = [ request.sepal_width, request.sepal_length, request.petal_width, request.petal_length, ] label_index = iris_clf_runner.predict.run([input_])[0] label = ["setosa", "versicolor", "virginica"][label_index] return Response(label=label) ================================================ FILE: mini_tutorials/custom_optimizer_in_pytorch/custom.py ================================================ import numpy as np import torch from torch.optim import Optimizer class WeirdDescent(Optimizer): """Take a coordinate descent step for a random parameter. And also, make every 100th step way bigger. """ def __init__(self, parameters, lr=1e-3): defaults = {"lr": lr} super().__init__(parameters, defaults) def step(self, closure=None): loss = None if closure is not None: loss = closure() if not self.state: self.state["step"] = 1 else: self.state["step"] += 1 c = 1 if self.state["step"] % 100 == 0: c = 100 grad = None while grad is None: param_group = np.random.choice(self.param_groups) tensor = np.random.choice(param_group["params"]) grad = tensor.grad.data element_ix = np.random.randint(tensor.numel()) mask_flat = torch.zeros(tensor.numel()) mask_flat[element_ix] = 1 mask = mask_flat.reshape(tensor.shape) tensor.data.add_(grad * mask, alpha=-param_group["lr"] * c) return loss ================================================ FILE: mini_tutorials/custom_optimizer_in_pytorch/src.py ================================================ from matplotlib.animation import FuncAnimation import matplotlib.pyplot as plt import numpy as np import torch from torch.optim import Adam, SGD from tqdm import tqdm from custom import WeirdDescent def rosenbrock(xy): """Evaluate Rosenbrock function. Parameters ---------- xy : tuple Two element tuple of floats representing the x resp. y coordinates. Returns ------- float The Rosenbrock function evaluated at the point `xy`. """ x, y = xy return (1 - x) ** 2 + 100 * (y - x ** 2) ** 2 def run_optimization(xy_init, optimizer_class, n_iter, **optimizer_kwargs): """Run optimization finding the minimum of the Rosenbrock function. Parameters ---------- xy_init : tuple Two floats representing the x resp. y coordinates. optimizer_class : object Optimizer class. n_iter : int Number of iterations to run the optimization for. optimizer_kwargs : dict Additional parameters to be passed into the optimizer. Returns ------- path : np.ndarray 2D array of shape `(n_iter + 1, 2)`. Where the rows represent the iteration and the columns represent the x resp. y coordinates. """ xy_t = torch.tensor(xy_init, requires_grad=True) optimizer = optimizer_class([xy_t], **optimizer_kwargs) path = np.empty((n_iter + 1, 2)) path[0, :] = xy_init for i in tqdm(range(1, n_iter + 1)): optimizer.zero_grad() loss = rosenbrock(xy_t) loss.backward() torch.nn.utils.clip_grad_norm_(xy_t, 1.0) optimizer.step() path[i, :] = xy_t.detach().numpy() return path def create_animation(paths, colors, names, figsize=(12, 12), x_lim=(-2, 2), y_lim=(-1, 3), n_seconds=5): """Create an animation. Parameters ---------- paths : list List of arrays representing the paths (history of x,y coordinates) the optimizer went through. colors : list List of strings representing colors for each path. names : list List of strings representing names for each path. figsize : tuple Size of the figure. x_lim, y_lim : tuple Range of the x resp. y axis. n_seconds : int Number of seconds the animation should last. Returns ------- anim : FuncAnimation Animation of the paths of all the optimizers. """ if not (len(paths) == len(colors) == len(names)): raise ValueError path_length = max(len(path) for path in paths) n_points = 300 x = np.linspace(*x_lim, n_points) y = np.linspace(*y_lim, n_points) X, Y = np.meshgrid(x, y) Z = rosenbrock([X, Y]) minimum = (1.0, 1.0) fig, ax = plt.subplots(figsize=figsize) ax.contour(X, Y, Z, 90, cmap="jet") scatters = [ax.scatter(None, None, label=label, c=c) for c, label in zip(colors, names)] ax.legend(prop={"size": 25}) ax.plot(*minimum, "rD") def animate(i): for path, scatter in zip(paths, scatters): scatter.set_offsets(path[:i, :]) ax.set_title(str(i)) ms_per_frame = 1000 * n_seconds / path_length anim = FuncAnimation(fig, animate, frames=path_length, interval=ms_per_frame) return anim if __name__ == "__main__": xy_init = (.3, .8) n_iter = 1500 path_adam = run_optimization(xy_init, Adam, n_iter) path_sgd = run_optimization(xy_init, SGD, n_iter, lr=1e-3) path_weird = run_optimization(xy_init, WeirdDescent, n_iter, lr=1e-3) freq = 10 paths = [path_adam[::freq], path_sgd[::freq], path_weird[::freq]] colors = ["green", "blue", "black"] names = ["Adam", "SGD", "Weird"] anim = create_animation(paths, colors, names, figsize=(12, 7), x_lim=(-.1, 1.1), y_lim=(-.1, 1.1), n_seconds=7) anim.save("result.gif") print(path_weird[-15:]) ================================================ FILE: mini_tutorials/deploying_on_kubernetes/Dockerfile ================================================ FROM huggingface/transformers-pytorch-gpu RUN python3 -c "from transformers import AutoModel;AutoModel.from_pretrained('bert-base-uncased')" RUN python3 -c "from transformers import AutoTokenizer;AutoTokenizer.from_pretrained('bert-base-uncased')" RUN pip install fastapi uvicorn EXPOSE 8888 ENTRYPOINT ["transformers-cli", "serve", "--port=8888", "--host=0.0.0.0", "--task=fill-mask", "--model=bert-base-uncased"] ================================================ FILE: mini_tutorials/deploying_on_kubernetes/DockerfileConda ================================================ FROM continuumio/miniconda3 RUN conda install -c conda-forge pytorch-cpu RUN conda install -c conda-forge fastapi RUN conda install -c conda-forge uvicorn RUN conda install -c huggingface transformers RUN conda install -c conda-forge huggingface_hub=0.2.1 RUN python3 -c "from transformers import AutoModel;AutoModel.from_pretrained('bert-base-uncased')" RUN python3 -c "from transformers import AutoTokenizer;AutoTokenizer.from_pretrained('bert-base-uncased')" EXPOSE 8888 ENTRYPOINT ["transformers-cli", "serve", "--port=8888", "--host=0.0.0.0", "--task=fill-mask", "--model=bert-base-uncased"] ================================================ FILE: mini_tutorials/deploying_on_kubernetes/README.md ================================================ # Relevant commands ## Creating an API ```bash transformers-cli serve --task=fill-mask --model=bert-base-uncased ``` ```bash curl http://localhost:8888 | jq ``` ```bash curl -X POST http://localhost:8888/forward -H "accept: application/json" -H "Content-Type: application/json" -d '{"inputs": "Today is going to be a [MASK] day"}' | jq ``` ## Containerization Build first image. ```bash docker build -t cool-api:v1 . ``` Build second image. ```bash docker build -t cool-api:v2 -f DockerfileConda . ``` Run image. ```bash docker run -it --rm -P cool-api:v2 ``` ## Deploying on Kubernetes Start a minikube cluster. ```bash minikube start ``` Get all objects across all namespaces. ```bash kubectl get all -A ``` List images. ```bash minikube image list ``` Load an image. ```bash minikube image cool-api:v2 ``` Create a deployment. ```bash kubectl create deploy cool-deploy --image=cool-api:v2 ``` Create a service. ```bash kubectl expose deploy/cool-deploy --name=cool-service --target-port=8888 --port=1234 ``` Scale up. ```bash kubectl scale deploy/cool-deploy --replicas=3 ``` Get logs. ```bash kubectl logs -f PODFULLNAME ``` ================================================ FILE: mini_tutorials/embedding/README.md ================================================ # Training data The Dracula book can be found here: https://archive.org/stream/draculabr00stokuoft/draculabr00stokuoft_djvu.txt ================================================ FILE: mini_tutorials/embedding/Visualize.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "id": "incredible-backup", "metadata": {}, "outputs": [], "source": [ "import ipywidgets\n", "import matplotlib.pyplot as plt\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "id": "proud-accreditation", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"res.csv\")\n", "last_epoch = df[\"epoch\"].max()" ] }, { "cell_type": "code", "execution_count": null, "id": "canadian-nightlife", "metadata": {}, "outputs": [], "source": [ "@ipywidgets.interact\n", "def f(epoch=ipywidgets.IntSlider(min=0, max=last_epoch , continuous_update=False)):\n", " fig, ax = plt.subplots(1, 1, figsize=(12, 8))\n", " ax.set_xlim([-2, 2])\n", " ax.set_ylim([-2, 2])\n", " df_iter = df[df[\"epoch\"] == epoch]\n", " df_iter.plot(kind='scatter', x='dim_0',y='dim_1', ax=ax, c=\"red\")\n", " df_iter[['dim_0','dim_1','character']].apply(lambda row:\n", " ax.text(row[\"dim_0\"] + 0.02,\n", " row[\"dim_1\"] + 0.01,\n", " row[\"character\"],\n", " fontsize=18),\n", " axis=1)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "early-vinyl", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: mini_tutorials/embedding/src.py ================================================ from collections import Counter, defaultdict import numpy as np import pandas as pd import torch from torch.nn import Embedding, Linear, LSTM, Module import torch.nn.functional as F from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler from tqdm import tqdm class CharacterDataset(Dataset): """Custom dataset. Parameters ---------- text : str Input text that will be used to create the entire database. window_size : int Number of characters to use as input features. vocab_size : int Number of characters in the vocabulary. Note that the last character is always reserved for a special "~" out-of-vocabulary character. Attributes ---------- ch2ix : defaultdict Mapping from the character to the position of that character in the vocabulary. Note that all characters that are not in the vocabulary will get mapped into the index `vocab_size - 1`. ix2ch : dict Mapping from the character position in the vocabulary to the actual character. vocabulary : list List of all characters. `len(vocabulary) == vocab_size`. """ def __init__(self, text, window_size=1, vocab_size=50): self.text = text.replace("\n", " ") self.window_size = window_size self.ch2ix = defaultdict(lambda: vocab_size - 1) most_common_ch2ix = { x[0]: i for i, x in enumerate(Counter(self.text).most_common()[: (vocab_size - 1)]) } self.ch2ix.update(most_common_ch2ix) self.ch2ix["~"] = vocab_size - 1 self.ix2ch = {v: k for k, v in self.ch2ix.items()} self.vocabulary = [self.ix2ch[i] for i in range(vocab_size)] def __len__(self): return len(self.text) - self.window_size def __getitem__(self, ix): X = torch.LongTensor( [self.ch2ix[c] for c in self.text[ix : ix + self.window_size]] ) y = self.ch2ix[self.text[ix + self.window_size]] return X, y class Network(Module): """Custom network predicting the next character of a string. Parameters ---------- vocab_size : int The number of characters in the vocabulary. embedding_dim : int Dimension of the character embedding vectors. dense_dim : int Number of neurons in the linear layer that follows the LSTM. hidden_dim : int Size of the LSTM hidden state. max_norm : int If any of the embedding vectors has a higher L2 norm than `max_norm` it is rescaled. n_layers : int Number of the layers of the LSTM. """ def __init__( self, vocab_size, embedding_dim=2, dense_dim=32, hidden_dim=8, max_norm=2, n_layers=1, ): super().__init__() self.embedding = Embedding( vocab_size, embedding_dim, padding_idx=vocab_size - 1, norm_type=2, max_norm=max_norm, ) self.lstm = LSTM( embedding_dim, hidden_dim, batch_first=True, num_layers=n_layers ) self.linear_1 = Linear(hidden_dim, dense_dim) self.linear_2 = Linear(dense_dim, vocab_size) def forward(self, x, h=None, c=None): """Run the forward pass. Parameters ---------- x : torch.Tensor Input tensor of shape `(n_samples, window_size)` of dtype `torch.int64`. h, c : torch.Tensor or None Hidden states of the LSTM. Returns ------- logits : torch.Tensor Tensor of shape `(n_samples, vocab_size)`. h, c : torch.Tensor or None Hidden states of the LSTM. """ emb = self.embedding(x) # (n_samples, window_size, embedding_dim) if h is not None and c is not None: _, (h, c) = self.lstm(emb, (h, c)) else: _, (h, c) = self.lstm(emb) # (n_layers, n_samples, hidden_dim) h_mean = h.mean(dim=0) # (n_samples, hidden_dim) x = self.linear_1(h_mean) # (n_samples, dense_dim) logits = self.linear_2(x) # (n_samples, vocab_size) return logits, h, c def compute_loss(cal, net, dataloader): """Computer average loss over a dataset.""" net.eval() all_losses = [] for X_batch, y_batch in dataloader: probs, _, _ = net(X_batch) all_losses.append(cal(probs, y_batch).item()) return np.mean(all_losses) def generate_text(n_chars, net, dataset, initial_text="Hello", random_state=None): """Generate text with the character-level model. Parameters ---------- n_chars : int Number of characters to generate. net : Module Character-level model. dataset : CharacterDataset Instance of the `CharacterDataset`. initial_text : str The starting text to be used as the initial condition for the model. random_state : None or int If not None, then the result is reproducible. Returns ------- res : str Generated text. """ if not initial_text: raise ValueError("You need to specify the initial text") res = initial_text net.eval() h, c = None, None if random_state is not None: np.random.seed(random_state) for _ in range(n_chars): previous_chars = initial_text if res == initial_text else res[-1] features = torch.LongTensor([[dataset.ch2ix[c] for c in previous_chars]]) logits, h, c = net(features, h, c) probs = F.softmax(logits[0], dim=0).detach().numpy() new_ch = np.random.choice(dataset.vocabulary, p=probs) res += new_ch return res if __name__ == "__main__": with open("text.txt", "r") as f: text = "\n".join(f.readlines()) # Hyperparameters model vocab_size = 70 window_size = 10 embedding_dim = 2 hidden_dim = 16 dense_dim = 32 n_layers = 1 max_norm = 2 # Training config n_epochs = 25 train_val_split = 0.8 batch_size = 128 random_state = 13 torch.manual_seed(random_state) loss_f = torch.nn.CrossEntropyLoss() dataset = CharacterDataset(text, window_size=window_size, vocab_size=vocab_size) n_samples = len(dataset) split_ix = int(n_samples * train_val_split) train_indices, val_indices = np.arange(split_ix), np.arange(split_ix, n_samples) train_dataloader = DataLoader( dataset, sampler=SubsetRandomSampler(train_indices), batch_size=batch_size ) val_dataloader = DataLoader( dataset, sampler=SubsetRandomSampler(val_indices), batch_size=batch_size ) net = Network( vocab_size, hidden_dim=hidden_dim, n_layers=n_layers, dense_dim=dense_dim, embedding_dim=embedding_dim, max_norm=max_norm, ) optimizer = torch.optim.Adam( net.parameters(), lr=1e-2, ) emb_history = [] for e in range(n_epochs + 1): net.train() for X_batch, y_batch in tqdm(train_dataloader): if e == 0: break optimizer.zero_grad() probs, _, _ = net(X_batch) loss = loss_f(probs, y_batch) loss.backward() optimizer.step() train_loss = compute_loss(loss_f, net, train_dataloader) val_loss = compute_loss(loss_f, net, val_dataloader) print(f"Epoch: {e}, {train_loss=:.3f}, {val_loss=:.3f}") # Generate one sentence initial_text = "I hope it works " generated_text = generate_text( 100, net, dataset, initial_text=initial_text, random_state=random_state ) print(generated_text) # Prepare DataFrame weights = net.embedding.weight.detach().clone().numpy() df = pd.DataFrame(weights, columns=[f"dim_{i}" for i in range(embedding_dim)]) df["epoch"] = e df["character"] = dataset.vocabulary emb_history.append(df) final_df = pd.concat(emb_history) final_df.to_csv("res.csv", index=False) ================================================ FILE: mini_tutorials/fewshot_text_classification/classify.py ================================================ import pathlib import jinja2 import openai path = pathlib.Path("template.jinja2") with path.open() as f: prompt_template = jinja2.Template(f.read()) labels = [ {"label": 0, "description": "negative sentiment"}, {"label": 1, "description": "neutral sentiment"}, {"label": 2, "description": "positive sentiment"}, ] examples = [ {"text": "Today was a horrible day", "label": 0}, {"text": "Yesterday was a great day", "label": 2}, ] text = "I loved the TV show" prompt = prompt_template.render( examples=examples, labels=labels, text=text, ) print(prompt) completion = openai.ChatCompletion.create( model="gpt-3.5-turbo", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt} ] ) print(completion.choices[0].message) ================================================ FILE: mini_tutorials/fewshot_text_classification/template.jinja2 ================================================ I want you to classify text for me. See below all the possible labels and their description {% for item in labels %} """ description: {{ item.description }} label: {{ item.label }} """ {% endfor %} {% if examples %} See below a couple of examples {% for item in examples %} """ text: {{ item.text }} label: {{ item.label }} """ {% endfor %} {% endif %} Here is the text that needs to be classified """ text: {{ text }} label: ================================================ FILE: mini_tutorials/gradient_wrt_input/explain.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch import torchvision.models as models from utils import compute_gradient, read_image, scale_grad, to_array def func(inp, net=None, target=None): """Get logit of a target class. Parameters ---------- inp : torch.Tensor Input image (single image batch). net : torch.nn.Module Classifier network. target : int Imagenet ground truth label id. Returns ------- logit : torch.Tensor Logit of the `target` class. """ out = net(inp) logit = out[0, target] return logit def compute_integrated_gradients(inp, baseline, net, target, n_steps=100): """Compute integrated gradients. Parameters ---------- inp : torch.Tensor Input image (single image batch) of shape `(1, 3, *, *)`. baseline : torch.Tensor Basline image of the same shape as the `inp`. net : torch.nn.Module Classifier network. target : int Imagenet ground truth label id. n_steps : int Number of steps between the `inp` and `baseline` tensors. Returns ------- ig : torch.Tensor Integrated gradients with the same shape as the `inp`. inp_grad : torch.Tensor Gradient with respect to the `inp` tensor. Same shape as `inp`. """ path = [baseline + a * (inp - baseline) for a in np.linspace(0, 1, n_steps)] grads = [compute_gradient(func, x, net=net, target=target) for x in path] ig = (inp - baseline) * torch.cat(grads[:-1]).mean(dim=0, keepdims=True) return ig, grads[-1] if __name__ == "__main__": net = models.resnet18(pretrained=True) net.eval() tensor = read_image("img.jpg") arr = to_array(tensor) n_steps = 100 baseline = -1.5 * torch.ones_like(tensor) ig, inp_grad = compute_integrated_gradients( tensor, baseline, net, 291, n_steps=n_steps ) ig_scaled = scale_grad(ig) inp_grad_scaled = scale_grad(inp_grad) _, (ax_baseline, ax_img, ax_inp_grad, ax_ig) = plt.subplots(1, 4, figsize=(19.20,10.80)) ax_baseline.imshow(to_array(baseline)) ax_img.imshow(arr) ax_inp_grad.imshow(arr * inp_grad_scaled) ax_ig.imshow(arr * ig_scaled) ax_baseline.set_title("Baseline") ax_img.set_title("Input") ax_inp_grad.set_title("Gradient input") ax_ig.set_title("Integrated gradients") ax_baseline.axis("off") ax_img.axis("off") ax_inp_grad.axis("off") ax_ig.axis("off") plt.savefig("res_2.png") ================================================ FILE: mini_tutorials/gradient_wrt_input/fool.py ================================================ import matplotlib.pyplot as plt import numpy as np import torch import torchvision.models as models from utils import compute_gradient, read_image, to_array def func(inp, net=None, target=None): """Compute negative log likelihood. Parameters ---------- inp : torch.Tensor Input image (single image batch). net : torch.nn.Module Classifier network. target : int Imagenet ground truth label id. Returns ------- loss : torch.Tensor Loss for the `inp` image. """ out = net(inp) loss = torch.nn.functional.nll_loss(out, target=torch.LongTensor([target])) print(f"Loss: {loss.item()}") return loss def attack(tensor, net, eps=1e-3, n_iter=50): """Run the Fast Sign Gradient Method (FSGM) attack. Parameters ---------- tensor : torch.Tensor The input image of shape `(1, 3, 224, 224)`. net : torch.nn.Module Classifier network. eps : float Determines how much we modify the image in a single iteration. n_iter : int Number of iterations. Returns ------- new_tensor : torch.Tensor New image that is a modification of the input image that "fools" the classifier. """ new_tensor = tensor.detach().clone() orig_prediction = net(tensor).argmax() print(f"Original prediction: {orig_prediction.item()}") for i in range(n_iter): net.zero_grad() grad = compute_gradient( func, new_tensor, net=net, target=orig_prediction.item() ) new_tensor = torch.clamp(new_tensor + eps * grad.sign(), -2, 2) new_prediction = net(new_tensor).argmax() if orig_prediction != new_prediction: print(f"We fooled the network after {i} iterations!") print(f"New prediction: {new_prediction.item()}") break return new_tensor, orig_prediction.item(), new_prediction.item() if __name__ == "__main__": net = models.resnet18(pretrained=True) net.eval() tensor = read_image("img.jpg") new_tensor, orig_prediction, new_prediction = attack( tensor, net, eps=1e-3, n_iter=100 ) _, (ax_orig, ax_new, ax_diff) = plt.subplots(1, 3, figsize=(19.20,10.80)) arr = to_array(tensor) new_arr = to_array(new_tensor) diff_arr = np.abs(arr - new_arr).mean(axis=-1) diff_arr = diff_arr / diff_arr.max() ax_orig.imshow(arr) ax_new.imshow(new_arr) ax_diff.imshow(diff_arr, cmap="gray") ax_orig.axis("off") ax_new.axis("off") ax_diff.axis("off") ax_orig.set_title(f"Original: {orig_prediction}") ax_new.set_title(f"Modified: {new_prediction}") ax_diff.set_title("Difference") plt.savefig("res_1.png") ================================================ FILE: mini_tutorials/gradient_wrt_input/utils.py ================================================ from PIL import Image import torch from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, ToTensor) def compute_gradient(func, inp, **kwargs): """Compute the gradient with respect to `inp`. Parameters ---------- func : callable Function that takes in `inp` and `kwargs` and returns a single element tensor. inp : torch.Tensor The tensor that we want to get the gradients for. Needs to be a leaf node. **kwargs : dict Additional keyword arguments passed into `func`. Returns ------- grad : torch.Tensor Tensor of the same shape as `inp` that is representing the gradient. """ inp.requires_grad = True loss = func(inp, **kwargs) loss.backward() inp.requires_grad = False return inp.grad.data def read_image(path): """Load image from disk and convert to torch.Tensor. Parameters ---------- path : str Path to the image. Returns ------- tensor : torch.Tensor Single sample batch containing our image (ready to be used with pretrained networks). The shape is `(1, 3, 224, 224)`. """ img = Image.open(path) transform = Compose([Resize(256), CenterCrop(224), ToTensor(), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) tensor_ = transform(img) tensor = tensor_.unsqueeze(0) return tensor def to_array(tensor): """Convert torch.Tensor to np.ndarray. Parameters ---------- tensor : torch.Tensor Tensor of shape `(1, 3, *, *)` representing one sample batch of images. Returns ------- arr : np.ndarray Array of shape `(*, *, 3)` representing an image that can be plotted directly. """ tensor_ = tensor.squeeze() unnormalize_transform = Compose([Normalize(mean=[0, 0, 0], std=[1 / 0.229, 1 / 0.224, 1 / 0.225]), Normalize(mean=[-0.485, -0.456, -0.406], std=[1, 1, 1])]) arr_ = unnormalize_transform(tensor_) arr = arr_.permute(1, 2, 0).detach().numpy() return arr def scale_grad(grad): """Scale gradient tensor. Parameters ---------- grad : torch.Tensor Gradient of shape `(1, 3, *, *)`. Returns ------- grad_arr : np.ndarray Array of shape `(*, *, 1)`. """ grad_arr = torch.abs(grad).mean(dim=1).detach().permute(1, 2, 0) grad_arr /= grad_arr.quantile(0.98) grad_arr = torch.clamp(grad_arr, 0, 1) return grad_arr.numpy() ================================================ FILE: mini_tutorials/haiku_basics/buffers_in_torch.py ================================================ import torch bn = torch.nn.BatchNorm1d(5) bn.state_dict() for name, p in bn.named_buffers(): print(name, p, p.requires_grad) for name, p in bn.named_parameters(): print(name, p, p.requires_grad) ================================================ FILE: mini_tutorials/haiku_basics/parameter.py ================================================ from __future__ import annotations import haiku as hk import jax import jax.numpy as jnp def foo(x: jnp.ndarray) -> jnp.ndarray: c = hk.get_parameter("c", x.shape, init=hk.initializers.RandomNormal(1)) res = c + x key = hk.next_rng_key() mask = jax.random.bernoulli(key, 0.5, x.shape) return res * mask * 2 foo_transformed = hk.transform(foo) init_key = jax.random.PRNGKey(24) apply_key_seq = hk.PRNGSequence(init_key) x = jnp.ones((2, 5)) params = foo_transformed.init(init_key, x) for _ in range(2): res = foo_transformed.apply(params, next(apply_key_seq), x) print(res) ================================================ FILE: mini_tutorials/haiku_basics/reallife.py ================================================ from __future__ import annotations import haiku as hk import jax import jax.numpy as jnp def foo(x: jnp.ndarray) -> jnp.ndarray: mlp = hk.nets.MLP([4, 5, 1]) loss = mlp(x).mean() return loss foo_transformed = hk.without_apply_rng(hk.transform(foo)) init_key = jax.random.PRNGKey(3452) x = jnp.ones((2, 3)) params = foo_transformed.init(init_key, x) grad_foo = jax.jit(jax.grad(foo_transformed.apply)) grads = grad_foo(params, x) ================================================ FILE: mini_tutorials/haiku_basics/requirements.txt ================================================ -e git+ssh://git@github.com/deepmind/dm-haiku.git@386efc098fd52a5cf728e7d13442138ab25eb235#egg=dm_haiku jax==0.3.5 jaxlib==0.3.5 ================================================ FILE: mini_tutorials/haiku_basics/state.py ================================================ from __future__ import annotations import haiku as hk import jax import jax.numpy as jnp def foo(x: jnp.ndarray) -> jnp.ndarray: c = hk.get_parameter("c", x.shape, init=hk.initializers.RandomNormal(1)) counter = hk.get_state( "counter", shape=[], dtype=jnp.int32, init=jnp.ones ) hk.set_state("counter", counter + 1) res = c + x + counter return res foo_transformed = hk.transform_with_state(foo) init_key = jax.random.PRNGKey(32) x = jnp.ones((2, 5)) params, state = foo_transformed.init(init_key, x) for i in range(2): print(f"After {i} iterations") res, state = foo_transformed.apply(params, state, None, x) print(state) print(res) ================================================ FILE: mini_tutorials/httpx_rate_limiting/script.py ================================================ import asyncio import logging import httpx logger = logging.getLogger() logging.getLogger("httpx").setLevel(logging.WARNING) logging.basicConfig(format="%(asctime)s %(name)s %(message)s", level=logging.INFO) async def send_request(client: httpx.AsyncClient, semaphore: asyncio.Semaphore) -> int: url = "https://pokeapi.co/api/v2/pokemon/ditto" async with semaphore: logger.info("Sending request") response = await client.get(url) logger.info("Response received") return response.status_code async def main() -> int: semaphore = asyncio.Semaphore(5) async with httpx.AsyncClient() as client: tasks = [asyncio.create_task(send_request(client, semaphore)) for _ in range(10)] status_codes = await asyncio.gather(*tasks) logger.info("All work done") return 0 if all(c == 200 for c in status_codes) else 1 if __name__ == "__main__": raise SystemExit(asyncio.run(main())) ================================================ FILE: mini_tutorials/mocking_neural_networks/app.py ================================================ import logging import sys import numpy as np import torch from transformers import AutoModelForMaskedLM, AutoTokenizer def get_top_k(sequence, tokenizer, model, k=10): """Get the top k most probable tokens to fill the gap with. Parameters ---------- sequence : str String containing the [MASK] token. tokenizer : BertFastTokenizer Tokenizer. model : BertForMaskedLM Model. k : int Number of the top results to return. Returns ------- top_vocab_indices : torch.Tensor 1D tensor representing the indices of the top tokens. """ batch_enc = tokenizer(sequence, return_tensors="pt") mask_ix = torch.where(batch_enc["input_ids"] == tokenizer.mask_token_id)[1] logits = model(**batch_enc).logits top_vocab_indices = torch.topk(logits[0, mask_ix.item(), :], k)[1] return top_vocab_indices if __name__ == "__main__": logging.disable(logging.WARNING) tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") sequence = sys.argv[1] top_indices = get_top_k(sequence, tokenizer, model, 5) top_tokens = [tokenizer.decode(torch.tensor([ix])) for ix in top_indices] winner = top_tokens[0] print(np.random.permutation(top_tokens)) guess = input("Who do you think is the winner? ").strip() if guess == winner: print("You won!!!") else: print("You lost!!!") print("\nTrue ranking") for i, x in enumerate(top_tokens): print(i, x) ================================================ FILE: mini_tutorials/mocking_neural_networks/test.py ================================================ from unittest.mock import Mock import pytest import torch from transformers import (AutoTokenizer, AutoModelForMaskedLM, BatchEncoding, BertForMaskedLM, BertTokenizerFast) from app import get_top_k @pytest.mark.parametrize("k", [5, 7]) def test_with_real_objects(k): tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased") sequence = "Hello [MASK]" res = get_top_k(sequence, tokenizer, model, k) assert isinstance(res, torch.Tensor) assert res.shape == (k,) @pytest.mark.parametrize("k", [5, 7]) def test_with_mock_objects(k): sequence = "Hello [MASK]" vocab_size = 1000 data = {"input_ids": torch.tensor([[101, 555, 103, 102]])} be = BatchEncoding(data=data) logits = torch.rand(1, 4, vocab_size) tokenizer_m = Mock(spec=BertTokenizerFast, return_value=be, mask_token_id=103) model_m = Mock(spec=BertForMaskedLM) model_m.return_value.logits = logits res = get_top_k(sequence, tokenizer_m, model_m, k=k) assert isinstance(res, torch.Tensor) assert res.shape == (k,) ================================================ FILE: mini_tutorials/numpy_equality_testing/test.py ================================================ import numpy as np import pytest def get_arrays(): """Create 4 arrays that are all similar but different. Returns ------- a : np.ndarray Reference array. a_eps : np.ndarray Same shape as `a`, however, the values are slightly different. a_dim : np.ndarray One extra dimension compared to `a`, however, the values are the same. a_nan : np.ndarray Same shape and same values, however, one entry is set to `np.nan`. """ eps = 1e-5 a = np.array([[1.2, 5.12, 2.4], [5.5, 8.8, 1.55]]) a_eps = a + eps a_dim = a[None, :] # shape (1, 2, 3) a_nan = a.copy() a_nan[0, 1] = np.nan return a, a_eps, a_dim, a_nan def test___eq__(): a, *_ = get_arrays() with pytest.raises(ValueError): assert a == a def test___eq__all(): a, a_eps, a_dim, a_nan = get_arrays() assert (a == a).all() assert not (a == a_eps).all() assert (a == a_dim).all() assert not (a_nan == a_nan).all() def test_array_equal(): a, a_eps, a_dim, a_nan = get_arrays() assert np.array_equal(a, a) assert not np.array_equal(a, a_eps) assert not np.array_equal(a, a_dim) assert not np.array_equal(a_nan, a_nan) assert np.array_equal(a_nan, a_nan, equal_nan=True) def test_allclose(): a, a_eps, a_dim, a_nan = get_arrays() atol = 1e-5 assert np.allclose(a, a, atol=atol) assert np.allclose(a, a_eps, atol=atol) assert np.allclose(a, a_dim, atol=atol) assert not np.allclose(a_nan, a_nan, atol=atol) assert np.allclose(a_nan, a_nan, atol=atol, equal_nan=True) def test_testing_array_equal(): a, a_eps, a_dim, a_nan = get_arrays() np.testing.assert_array_equal(a, a) # np.testing.assert_array_equal(a, a_eps) # np.testing.assert_array_equal(a, a_dim) np.testing.assert_array_equal(a_nan, a_nan) def test_testing_allclose(): a, a_eps, a_dim, a_nan = get_arrays() atol = 1e-5 np.testing.assert_allclose(a, a, atol=atol) np.testing.assert_allclose(a, a_eps, atol=atol) # np.testing.assert_allclose(a, a_dim, atol=atol) np.testing.assert_allclose(a_nan, a_nan, atol=atol) # np.testing.assert_allclose(a_nan, a_nan, atol=atol, equal_nan=False) ================================================ FILE: mini_tutorials/openai_function_calling/example.py ================================================ import json import logging import operator import sys import datetime import openai import yfinance as yf TODAY = datetime.date.today().strftime("%Y/%m/%d") logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(message)s") logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) def get_price(symbol: str, date: str) -> float: logger.info(f"Calling get_price with {symbol=} and {date=}") history = yf.download( symbol, start=date, period="1d", interval="1d", progress=False ) return history["Close"].iloc[0].item() def calculate(a: float, b: float, op: str) -> float: logger.info(f"Calling calculate with {a=}, {b=} and {op=}") return getattr(operator, op)(a, b) get_price_metadata = { "name": "get_price", "description": "Get closing price of a financial instrument on a given date", "parameters": { "type": "object", "properties": { "symbol": { "type": "string", "description": "Ticker symbol of a financial instrument", }, "date": { "type": "string", "description": "Date in the format YYYY-MM-DD", }, }, "required": ["symbol", "date"], }, } calculate_metadata = { "name": "calculate", "description": "General purpose calculator", "parameters": { "type": "object", "properties": { "a": { "type": "number", "description": "First entry", }, "b": { "type": "number", "description": "Second entry", }, "op": { "type": "string", "enum": ["mul", "add", "truediv", "sub"], "description": "Binary operation", }, }, "required": ["a", "b", "op"], }, } messages = [ {"role": "user", "content": sys.argv[1]}, { "role": "system", "content": "You are a helpful financial investor who overlooks the " f"performance of stocks. Today is {TODAY}. Note that the " "format of the date is YYYY/MM/DD", }, ] while True: response = openai.ChatCompletion.create( model="gpt-3.5-turbo-0613", temperature=0, messages=messages, functions=[get_price_metadata, calculate_metadata], ) message = response["choices"][0]["message"] messages.append(message) if "function_call" not in message: break # call custom functions function_name = message["function_call"]["name"] kwargs = json.loads(message["function_call"]["arguments"]) if function_name == "get_price": output = str(get_price(**kwargs)) elif function_name == "calculate": output = str(calculate(**kwargs)) else: raise ValueError messages.append({"role": "function", "name": function_name, "content": output}) print("*" * 80) print([m["role"] for m in messages]) print("*" * 80) print(messages[-1]["content"]) ================================================ FILE: mini_tutorials/rag_with_reranking/README.md ================================================ # Description ## Installation Run the following command to deploy a simple OpenSearch DB locally. ```bash docker run -p 9200:9200 -p 9600:9600 -e "DISABLE_SECURITY_PLUGIN=true" -e "discovery.type=single-node" --name opensearch-node -d opensearchproject/opensearch:latest ``` The version of the image was `2.10.0` at the time of making the video. To install the Python dependencies run ```bash pip install opensearch-py cohere ``` Again, I did not hardcode any version, but the versions at the time of making the video were ```bash cohere==4.27 opensearch-py==2.3.1 ``` ## Contents * `answer.py` - scripts that does RAG question answering - requires question as the only argument * `input.txt` - each line corresponds to a document to be added to OpenSearch(except for emtpy lines and comments) * `upload_data.py` - load `input.txt` into OpenSearch Note that to use the `answer.py` you need to get a Cohere API token and then export ```bash export COHERE_API_KEY=VERYSECRET python answer.py 'What is the meaning of life?' ``` ## Postman You can import the `postman_collection.json` in Postman and then simply add the following 3 variables in your environment * `OpenSearchURL` - will be `http://localhost:9200` if you follow the above instructions * `CohereURL` - should be `https://api.cohere.ai/v1` * `CohereAPIKey` - you need to generate this yourself # Diagrams ## RAG with embeddings rag-with-embeddings ## RAG with reranking rag-with-reranking ================================================ FILE: mini_tutorials/rag_with_reranking/answer.py ================================================ import os import sys import cohere from opensearchpy import OpenSearch # Helper def generate_prompt(question: str, contexts: str): prompt = ( "Given the following extracted parts of a long document and a " 'question, create a final answer with references ("SOURCES").' "If you don't know the answer, just say that you don't know, don't try " 'to make up an answer. ALWAYS return a "SOURCES" part in your answer.\n' ) prompt += f"QUESTION: {question}\n" prompt += "".join( [f"SOURCE {i}: {context}\n" for i, context in enumerate(contexts)] ) prompt += "ANSWER: " return prompt # PARAMETERS INDEX_NAME = "cool_index" FIELD_NAME = "stuff" RETRIEVER_K = 5 RERANKER_K = 2 COHERE_API_KEY = os.environ["COHERE_API_KEY"] question = sys.argv[1] # Instantiate clients os_client = OpenSearch( hosts=[ { "host": "localhost", "port": 9200, } ] ) cohere_client = cohere.Client(COHERE_API_KEY) # Retrieve os_results = os_client.search( body={ "query": { "match": { FIELD_NAME: question } } }, size=RETRIEVER_K ) contexts = [x["_source"][FIELD_NAME] for x in os_results["hits"]["hits"]] print("OpenSearch: ", contexts) # Rerank cohere_results = cohere_client.rerank( model="rerank-english-v2.0", query=question, documents=contexts, top_n=RERANKER_K, ) reranked_contexts = [r.document["text"] for r in cohere_results] print("Cohere Reranked: ", reranked_contexts) # Chat completion prompt = generate_prompt(question, reranked_contexts) response = cohere_client.chat( chat_history=[], message=prompt ) print("Answer: ", response.text) ================================================ FILE: mini_tutorials/rag_with_reranking/input.txt ================================================ # AGE AND FAVOURITE FOOD - 'What is the favourite food of Charles?', 'Who prefers vegetables the most?' Adam is older than Ben Ben is older then Charles Adam eats a lot of carrots Ben's favourite food is an apple Charles loves KFC Whatever, this sentence does not really contain anything super important # SPORTING EVENTS - 'What country managed to become world football champion after 2050'? Brazil won the Fifa World Cup in 2070 France is pretty good at football and won many championships Finland has won many ice hockey world cups Jamaica won the Athletics World Cup in 2055 Mexico won the Golf World Cup in 2050 ================================================ FILE: mini_tutorials/rag_with_reranking/postman_collection.json ================================================ { "info": { "name": "Retrieval augmented generation", "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" }, "item": [ { "name": "OpenSearch", "item": [ { "name": "Get all indices", "request": { "method": "GET", "header": [], "url": { "raw": "{{OpenSearchURL}}/_cat/indices?v=true&s=index", "host": [ "{{OpenSearchURL}}" ], "path": [ "_cat", "indices" ], "query": [ { "key": "v", "value": "true" }, { "key": "s", "value": "index" } ] } }, "response": [] }, { "name": "Create index", "request": { "method": "PUT", "header": [], "body": { "mode": "raw", "raw": "{\n \"settings\": {\n \"index\": {\n \"number_of_shards\": 1,\n \"number_of_replicas\": 1\n }\n },\n \"mappings\": {\n \"properties\": {\n \"stuff\": {\n \"type\": \"text\"\n }\n }\n }\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index" ] } }, "response": [] }, { "name": "Delete index", "request": { "method": "DELETE", "header": [], "body": { "mode": "raw", "raw": "", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index" ] } }, "response": [] }, { "name": "Add document", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"stuff\": \"This is just some document\"\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index/_doc", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index", "_doc" ] } }, "response": [] }, { "name": "List all documents", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"query\": {\n \"match_all\": {}\n }\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index/_search", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index", "_search" ] } }, "response": [] }, { "name": "Lexical (BM 25) search", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"query\": {\n \"match\": {\n \"stuff\": \"Some document\"\n }\n }\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{OpenSearchURL}}/cool_index/_search", "host": [ "{{OpenSearchURL}}" ], "path": [ "cool_index", "_search" ] } }, "response": [] } ] }, { "name": "Cohere", "item": [ { "name": "Embed", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"texts\": [\n \"hello\",\n \"goodbye\"\n ],\n \"truncate\": \"END\"\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{CohereURL}}/embed", "host": [ "{{CohereURL}}" ], "path": [ "embed" ] }, "description": "[https://docs.cohere.com/reference/embed](https://docs.cohere.com/reference/embed)" }, "response": [] }, { "name": "Rerank", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": "{\n \"return_documents\": false,\n \"max_chunks_per_doc\": 10,\n \"query\": \"What is the capital of the United States?\",\n \"documents\": [\n \"Carson City is the capital city of the American state of Nevada.\",\n \"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.\",\n \"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.\",\n \"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.\"\n ]\n}", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{CohereURL}}/rerank", "host": [ "{{CohereURL}}" ], "path": [ "rerank" ] }, "description": "[https://docs.cohere.com/reference/embed](https://docs.cohere.com/reference/embed)" }, "response": [] }, { "name": "Chat", "request": { "method": "POST", "header": [], "body": { "mode": "raw", "raw": " {\n \"chat_history\": [\n {\"role\": \"USER\", \"message\": \"Who discovered gravity?\"},\n {\"role\": \"CHATBOT\", \"message\": \"The man who is widely credited with discovering gravity is Sir Isaac Newton\"}\n ],\n \"message\": \"What year was he born?\"\n }", "options": { "raw": { "language": "json" } } }, "url": { "raw": "{{CohereURL}}/chat", "host": [ "{{CohereURL}}" ], "path": [ "chat" ] }, "description": "" }, "response": [] } ], "auth": { "type": "bearer", "bearer": [ { "key": "token", "value": "{{CohereAPIKey}}", "type": "string" } ] }, "event": [ { "listen": "prerequest", "script": { "type": "text/javascript", "exec": [ "" ] } }, { "listen": "test", "script": { "type": "text/javascript", "exec": [ "" ] } } ] } ] } ================================================ FILE: mini_tutorials/rag_with_reranking/upload_data.py ================================================ from pathlib import Path from opensearchpy import OpenSearch INPUT_FILE = "input.txt" INDEX_NAME = "cool_index" FIELD_NAME = "stuff" client = OpenSearch( hosts=[ { "host": "localhost", "port": 9200, } ] ) print(client.ping()) with Path(INPUT_FILE).open() as f: i = 0 for line in f.read().splitlines(): if not line or line.startswith("#"): continue print(f"Adding {i}") client.index(index=INDEX_NAME, body={FIELD_NAME: line}) i += 1 ================================================ FILE: mini_tutorials/visualizing_activations_with_forward_hooks/src.py ================================================ import pathlib import torch import torch.nn.functional as F from torch.nn import Linear, Module from torch.utils.tensorboard import SummaryWriter class Network(Module): def __init__(self): super().__init__() self.fc_1 = Linear(10, 20) self.fc_2 = Linear(20, 30) self.fc_3 = Linear(30, 2) def forward(self, x): x = self.fc_1(x) x = self.fc_2(x) x = self.fc_3(x) x = F.relu(x) return x if __name__ == "__main__": log_dir = pathlib.Path.cwd() / "tensorboard_logs" writer = SummaryWriter(log_dir) x = torch.rand(1, 10) net = Network() def activation_hook(inst, inp, out): """Run activation hook. Parameters ---------- inst : torch.nn.Module The layer we want to attach the hook to. inp : tuple of torch.Tensor The input to the `forward` method. out : torch.Tensor The output of the `forward` method. """ print("Here") writer.add_histogram(repr(inst), out) handle_1 = net.fc_1.register_forward_hook(activation_hook) net.fc_2.register_forward_hook(activation_hook) net.fc_3.register_forward_hook(activation_hook) y = net(x) handle_1.remove() y = net(x)