Repository: 0xD4rky/Vision-Transformers
Branch: main
Commit: a4a489cfad34
Files: 13
Total size: 41.6 KB

Directory structure:
gitextract_fego_pme/
├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
└── vit/
    ├── readme.md
    ├── src/
    │   ├── ViT.py
    │   ├── base.py
    │   ├── data.py
    │   ├── requirements.txt
    │   ├── trainer.py
    │   ├── utils.py
    │   └── vit_with_lora.py
    └── visualize/
        └── vis.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file. For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2024 Ishaan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: README.md
================================================
# Zero-to-Hero: ViT🚀

I have tried to cover all the bases for understanding and implementing Vision Transformers (ViT) and their evolution into Video Vision Transformers (ViViT). The main focus is on modeling spatio-temporal relations with visual transformers.

![image](https://github.com/user-attachments/assets/bc8a2727-b33a-4681-aee6-c6b617e7ad81)

## 1. Vision Transformer (ViT) Fundamentals

### Surveys and Overviews:
* [Transformers in Vision: A Survey](https://arxiv.org/abs/2101.01169)
* [A Survey of Visual Transformers](https://arxiv.org/abs/2111.06091)

### Key Papers:
* An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale: [Paper](https://arxiv.org/abs/2010.11929) | [Code](https://github.com/google-research/vision_transformer)
* Training data-efficient image transformers & distillation through attention (DeiT): [Paper](https://arxiv.org/abs/2012.12877) | [Code](https://github.com/facebookresearch/deit)

### Concepts and Tutorials:
* "Attention Is All You Need": [Paper](https://arxiv.org/abs/1706.03762)
* "The Illustrated Transformer": [Blog Post](http://jalammar.github.io/illustrated-transformer/)
* "Vision Transformer Explained": [Blog Post](https://theaisummer.com/vision-transformer/)
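The core idea in the "16x16 Words" paper is that an image is split into fixed-size patches, each of which becomes one token in the transformer's input sequence. A quick back-of-the-envelope check (illustrative, not from this repo):

```python
# Illustrative only: how many tokens a ViT sees for a given image/patch size.
image_size, patch_size, channels = 224, 16, 3

num_patches = (image_size // patch_size) ** 2   # 14 * 14 = 196 tokens
patch_dim = channels * patch_size * patch_size  # 3 * 16 * 16 = 768 values per patch

print(num_patches, patch_dim)  # 196 768
```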
## 2. Convolutional ViT and Hybrid Models:
* CvT: Introducing Convolutions to Vision Transformers: [Paper](https://arxiv.org/abs/2103.15808) | [Code](https://github.com/microsoft/CvT)
* CoAtNet: Marrying Convolution and Attention for All Data Sizes: [Paper](https://arxiv.org/abs/2106.04803)
* ConViT: Improving Vision Transformers with Soft Convolutional Inductive Biases: [Paper](https://arxiv.org/abs/2103.10697) | [Code](https://github.com/facebookresearch/convit)

## 3. Efficient Transformers and Swin Transformer:
* Swin Transformer: Hierarchical Vision Transformer using Shifted Windows: [Paper](https://arxiv.org/abs/2103.14030) | [Code](https://github.com/microsoft/Swin-Transformer)
* Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions: [Paper](https://arxiv.org/abs/2102.12122) | [Code](https://github.com/whai362/PVT)
* Efficient Transformers: A Survey: [Paper](https://arxiv.org/abs/2009.06732)

## 4. Space-Time Attention and Video Transformers:
* TimeSformer: Is Space-Time Attention All You Need for Video Understanding? [Paper](https://arxiv.org/abs/2102.05095) | [Code](https://github.com/facebookresearch/TimeSformer)
* Space-Time Mixing Attention for Video Transformer: [Paper](https://arxiv.org/abs/2106.05968)
* MViT: Multiscale Vision Transformers: [Paper](https://arxiv.org/abs/2104.11227) | [Code](https://github.com/facebookresearch/SlowFast)

## 5. Video Vision Transformer (ViViT):
* ViViT: A Video Vision Transformer: [Paper](https://arxiv.org/abs/2103.15691) | [Code](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit)
* Video Transformer Network: [Paper](https://arxiv.org/abs/2102.00719) | [Code](https://github.com/mx-mark/VideoTransformer-pytorch)

## How to use this Repo?
* Start by reading the survey papers to get a broad understanding of the field.
* For each key paper, read the abstract and introduction, then skim the methodology and results sections.
* Implement key concepts using the provided GitHub repositories or your own code.
* Experiment with different architectures and datasets to solidify your understanding.
* Use the additional resources to dive deeper into specific topics or applications.

================================================
FILE: requirements.txt
================================================
torch
torchvision
transformers
timm
matplotlib
opencv-python
plotly
streamlit
gradio
flask

================================================
FILE: vit/readme.md
================================================
# Building ViT from scratch

## INFO:
This project implements a Vision Transformer (ViT) from scratch in Python and PyTorch. The implementation follows the original paper "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" by Dosovitskiy et al. The model is trained and evaluated on the CIFAR-10 dataset.

## Project Structure:
The project consists of the following main files:
* `base.py`: Contains the GELU activation function implementation. [Paper](https://arxiv.org/abs/1606.08415) | [Code](https://github.com/karfaoui/gelu)
* `data.py`: Handles data preparation using the CIFAR-10 dataset.
* `ViT.py`: Contains the Vision Transformer model implemented from scratch.
* `vit_with_lora.py`: The same ViT with optional LoRA adapters on the attention, MLP, and classifier layers.
* `trainer.py`: Implements the entire training and evaluation pipeline.
* `utils.py`: Contains utility functions for model and checkpoint management.
* `visualize/` contains `vis.py` to visualize image patches and attention maps.
## Requirements:
```
cd vit/src
pip install -r requirements.txt
```

## Usage:
1. Clone the repo:
```
git clone https://github.com/0xD4rky/Vision-Transformers.git
cd vit/src
```
2. Prepare the data:
The `data.py` script handles the CIFAR-10 dataset preparation. You don't need to run it separately; the trainer calls it for you.
3. Training:
```
python trainer.py --exp-name vit-cifar10
```
This script trains the Vision Transformer on the CIFAR-10 dataset and evaluates its performance.

## Model Architecture
The Vision Transformer (ViT) architecture is implemented in `ViT.py`. It follows the original paper's design, including:
* Patch embedding
* Positional embedding
* Transformer encoder with multi-head self-attention and feed-forward layers
* Classification head

A rough shape walkthrough for the default CIFAR-10 configuration is sketched below.
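A minimal sketch (not from the repo), assuming the default config in `trainer.py` (32x32 images, patch size 4, embedding dim 48):

```python
import torch

# Illustrative shape walkthrough, assuming the default config in trainer.py.
batch, channels, image_size = 8, 3, 32
patch_size, vector_dim, num_classes = 4, 48, 10

x = torch.randn(batch, channels, image_size, image_size)  # (8, 3, 32, 32)

num_patches = (image_size // patch_size) ** 2             # 8 * 8 = 64 patches
# PatchEmbeddings: strided Conv2d then flatten   -> (batch, 64, 48)
# Embeddings: prepend [CLS], add positional enc. -> (batch, 65, 48)
# Encoder: stack of blocks, shape preserved      -> (batch, 65, 48)
# Classifier: take the [CLS] token               -> (batch, 48) -> (batch, 10)
print(num_patches + 1)  # 65 tokens per image
```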
""" def __init__(self, config): super().__init__() self.image_size = config["image_size"] self.patch_size = config["patch_size"] self.num_channels = config["num_channels"] self.vector_dim = config["vector_dim"] self.num_patches = (self.image_size // self.patch_size) ** 2 self.projection = nn.Conv2d(self.num_channels, self.vector_dim, kernel_size=self.patch_size, stride=self.patch_size) def forward(self, x): # {batch_size, num_channels, image_size, image_size}-> {batch_size, num_patches, vector_dim} x = self.projection(x) x = x.flatten(2).transpose(1, 2) return x class Embeddings(nn.Module): """ adding positional information to extracted patch embeddings """ def __init__(self,config): self.config = config self.patch_emb = PatchEmbeddings(config) self.cls_token = nn.Parameter(torch.randn(1,1,config["vector_dim"])) # create learnable positional encoding and add +1 dim for [CLS] self.positional_encoding = nn.Parameter(torch.randn(1,self.patch_emb.num_patches + 1, config["vector_dim"])) self.droput = nn.Dropout(config["droput_prob"]) def forward(self,x): x = self.patch_emb(x) batch_size, _, _ = x.size() # expand the [cls] token to batch size #{1,1,vector_dim} -> (batch_size,1,hidden_size) cls_tokens = self.cls_token.expand(batch_size,-1,-1) """ concatenating cls token to inputn sequence size : {num_patches + 1} """ x = torch.cat((cls_tokens,x),dim = 1) x = x + self.positional_encoding return x class Attention(nn.Module): """ Attention module Will be used in: Multi-headed-attention Module """ def __init__(self,vector_dim,attention_head_size,dropout,bias = True): super().__init__() self.vector_dim = vector_dim self.attention_head_size = attention_head_size self.dropout = nn.Dropout(dropout) # {query,key,value} self.query = nn.Linear(vector_dim,attention_head_size, bias = bias) self.key = nn.Linear(vector_dim, attention_head_size,bias = bias) self.value = nn.Linear(vector_dim,attention_head_size,bias = bias) def forward(self,x): query = self.query(x) key = self.key(x) value = self.value(x) # i have them in matrix form similarity = torch.matmul(query,key.transpose(-1,-2)) attention_probs = nn.functional.softmax((similarity/math.sqrt(self.attention_head_size)),dim = 1) attention_probs = self.dropout(attention_probs) output = torch.matmul(attention_probs,value) return output,attention_probs class MultiheadAttention(nn.Module): """ Multi-headed-attention module Will be used in: Transformer Encoder """ def __init__(self,config): super().__init() self.vector_dim = config["vector_dim"] self.num_attention_heads = config["num_attention_heads"] self.attention_head_size = self.vector_sim // self.num_attention_heads self.all_head_size = self.num_attention_heads * self.attention_head_size self.qkv_bias = config["qkv_bias"] #creating a list of attention heads self.heads = nn.ModuleList([]) for _ in range(self.num_attention_heads): head = Attention( self.vector_dim, self.attention_head_size, config["attention_probs_dropout_prob"], self.qkv_bias ) self.heads.append(head) # project attention output back to vector dim self.output_projection = nn.Linear(self.all_head_size,self.vector_dim) self.output_dropout = nn.Dropout(config["hidden_dropout_prob"]) def forward(self,x,output_attentions = False): attention_outputs = [head(x) for head in self.heads] # for each attention head attention_output = torch.cat([attention_output for attention_output, _ in attention_outputs],dim=-1) # Project the concatenated attention output back to the hidden size attention_output = self.output_projection(attention_output) attention_output 
================================================
FILE: vit/src/ViT.py
================================================
from base import *  # brings in torch, nn, math, and NewGELUActivation


class PatchEmbeddings(nn.Module):
    """
    Convert the image into patches and then project them into a vector space.
    """

    def __init__(self, config):
        super().__init__()
        self.image_size = config["image_size"]
        self.patch_size = config["patch_size"]
        self.num_channels = config["num_channels"]
        self.vector_dim = config["vector_dim"]
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.projection = nn.Conv2d(self.num_channels, self.vector_dim,
                                    kernel_size=self.patch_size, stride=self.patch_size)

    def forward(self, x):
        # (batch_size, num_channels, image_size, image_size) -> (batch_size, num_patches, vector_dim)
        x = self.projection(x)
        x = x.flatten(2).transpose(1, 2)
        return x


class Embeddings(nn.Module):
    """
    Add a learnable [CLS] token and positional information to the extracted patch embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.patch_emb = PatchEmbeddings(config)
        self.cls_token = nn.Parameter(torch.randn(1, 1, config["vector_dim"]))
        # learnable positional encoding; +1 position for the [CLS] token
        self.positional_encoding = nn.Parameter(
            torch.randn(1, self.patch_emb.num_patches + 1, config["vector_dim"]))
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x):
        x = self.patch_emb(x)
        batch_size, _, _ = x.size()
        # expand the [CLS] token to the batch: (1, 1, vector_dim) -> (batch_size, 1, vector_dim)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        # concatenate the [CLS] token to the input; sequence length becomes num_patches + 1
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.positional_encoding
        x = self.dropout(x)
        return x


class Attention(nn.Module):
    """
    Single attention head. Used inside the multi-head attention module.
    """

    def __init__(self, vector_dim, attention_head_size, dropout, bias=True):
        super().__init__()
        self.vector_dim = vector_dim
        self.attention_head_size = attention_head_size
        self.dropout = nn.Dropout(dropout)
        # {query, key, value} projections
        self.query = nn.Linear(vector_dim, attention_head_size, bias=bias)
        self.key = nn.Linear(vector_dim, attention_head_size, bias=bias)
        self.value = nn.Linear(vector_dim, attention_head_size, bias=bias)

    def forward(self, x):
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        # scaled dot-product attention; softmax over the key dimension
        similarity = torch.matmul(query, key.transpose(-1, -2))
        attention_probs = nn.functional.softmax(
            similarity / math.sqrt(self.attention_head_size), dim=-1)
        attention_probs = self.dropout(attention_probs)
        output = torch.matmul(attention_probs, value)
        return (output, attention_probs)


class MultiheadAttention(nn.Module):
    """
    Multi-head attention module. Used inside the transformer encoder.
    """

    def __init__(self, config):
        super().__init__()
        self.vector_dim = config["vector_dim"]
        self.num_attention_heads = config["num_attention_heads"]
        self.attention_head_size = self.vector_dim // self.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.qkv_bias = config["qkv_bias"]
        # one Attention module per head
        self.heads = nn.ModuleList([
            Attention(self.vector_dim,
                      self.attention_head_size,
                      config["attention_probs_dropout_prob"],
                      self.qkv_bias)
            for _ in range(self.num_attention_heads)
        ])
        # project the concatenated head outputs back to vector_dim
        self.output_projection = nn.Linear(self.all_head_size, self.vector_dim)
        self.output_dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x, output_attentions=False):
        attention_outputs = [head(x) for head in self.heads]
        attention_output = torch.cat(
            [attention_output for attention_output, _ in attention_outputs], dim=-1)
        attention_output = self.output_projection(attention_output)
        attention_output = self.output_dropout(attention_output)
        # return the attention output and, optionally, the attention probabilities
        if not output_attentions:
            return (attention_output, None)
        attention_probs = torch.stack(
            [attention_probs for _, attention_probs in attention_outputs], dim=1)
        return (attention_output, attention_probs)


class MLP(nn.Module):
    """
    Multi-layer perceptron module (the transformer feed-forward block).
    """

    def __init__(self, config):
        super().__init__()
        self.dense_1 = nn.Linear(config["vector_dim"], config["hidden_size"])
        self.act = NewGELUActivation()
        self.dense_2 = nn.Linear(config["hidden_size"], config["vector_dim"])
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x):
        x = self.dense_1(x)
        x = self.act(x)
        x = self.dense_2(x)
        x = self.dropout(x)
        return x


class Block(nn.Module):
    """
    Single transformer block (pre-norm: LayerNorm before attention/MLP, with skip connections).
    """

    def __init__(self, config):
        super().__init__()
        self.attention = MultiheadAttention(config)
        self.layer_norm1 = nn.LayerNorm(config["vector_dim"])
        self.mlp = MLP(config)
        self.layer_norm2 = nn.LayerNorm(config["vector_dim"])

    def forward(self, x, output_attentions=False):
        # self-attention on the normalized input, then skip connection
        attention_output, attention_probs = self.attention(
            self.layer_norm1(x), output_attentions=output_attentions)
        x = x + attention_output
        # feed-forward network on the normalized input, then skip connection
        mlp_output = self.mlp(self.layer_norm2(x))
        x = x + mlp_output
        if not output_attentions:
            return (x, None)
        return (x, attention_probs)


class Encoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.blocks = nn.ModuleList([Block(config) for _ in range(config["num_hidden_layers"])])

    def forward(self, x, output_attentions=False):
        # Calculate the transformer block's output for each block
        all_attentions = []
        for block in self.blocks:
            x, attention_probs = block(x, output_attentions=output_attentions)
            if output_attentions:
                all_attentions.append(attention_probs)
        # Return the encoder's output and the attention probabilities (optional)
        if not output_attentions:
            return (x, None)
        return (x, all_attentions)


class Classification(nn.Module):
    """
    ViT model for classification.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.image_size = config["image_size"]
        self.vector_dim = config["vector_dim"]
        self.num_classes = config["num_classes"]
        # pipeline: embeddings -> encoder -> classifier
        self.embeddings = Embeddings(config)
        self.encoder = Encoder(config)
        self.classifier = nn.Linear(self.vector_dim, self.num_classes)
        self.apply(self._init_weights)

    def forward(self, x, output_attentions=False):
        embedding_output = self.embeddings(x)
        encoder_output, all_attentions = self.encoder(
            embedding_output, output_attentions=output_attentions)
        # take the [CLS] token's output as the feature for classification
        logits = self.classifier(encoder_output[:, 0, :])
        if not output_attentions:
            return (logits, None)
        return (logits, all_attentions)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config["initializer_range"])
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Embeddings):
            module.positional_encoding.data = nn.init.trunc_normal_(
                module.positional_encoding.data.to(torch.float32),
                mean=0.0,
                std=self.config["initializer_range"],
            ).to(module.positional_encoding.dtype)
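For reference, a quick smoke test of the model above (not part of the repo; it reuses the same config keys as `trainer.py`):

```python
# Hypothetical smoke test for ViT.py (run from vit/src).
import torch
from ViT import Classification

config = {
    "patch_size": 4, "vector_dim": 48, "num_hidden_layers": 4,
    "num_attention_heads": 4, "hidden_size": 4 * 48,
    "hidden_dropout_prob": 0.0, "attention_probs_dropout_prob": 0.0,
    "initializer_range": 0.02, "image_size": 32, "num_classes": 10,
    "num_channels": 3, "qkv_bias": True,
}

model = Classification(config)
images = torch.randn(2, 3, 32, 32)           # dummy CIFAR-10 batch
logits, attentions = model(images, output_attentions=True)
print(logits.shape)                          # torch.Size([2, 10])
print(len(attentions), attentions[0].shape)  # 4 blocks, each (2, 4, 65, 65)
```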
================================================
FILE: vit/src/base.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.datasets as datasets
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
import math


class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in the Google BERT repo
    (identical to OpenAI GPT). Also see the Gaussian Error Linear Units paper:
    https://arxiv.org/abs/1606.08415

    Taken from https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py
    """

    def forward(self, input):
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
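This is the tanh approximation of GELU: gelu(x) ≈ 0.5 · x · (1 + tanh(√(2/π) · (x + 0.044715 · x³))). PyTorch ships the same approximation, so a quick equivalence check (illustrative, not in the repo) is:

```python
# Illustrative check: NewGELUActivation matches PyTorch's tanh-approximated GELU.
import torch
import torch.nn.functional as F
from base import NewGELUActivation

x = torch.linspace(-3, 3, steps=101)
ours = NewGELUActivation()(x)
ref = F.gelu(x, approximate="tanh")          # available in torch >= 1.12
print(torch.allclose(ours, ref, atol=1e-6))  # True
```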
"qkv_bias": True, } assert config["vector_dim"] % config["num_attention_heads"] == 0 assert config['hidden_size'] == 4 * config['vector_dim'] assert config['image_size'] % config['patch_size'] == 0 class Trainer: """ simple trainer block """ def __init__(self,model,optimizer,loss_fn,exp_name,device): self.model = model.to(device) self.optim = optimizer self.loss = loss_fn self.exp_name = exp_name self.device = device def train(self,train_loader,test_loader,epochs,save_exp_every_n_epochs = 0): train_losses, test_losses, accuracies = [],[],[] for i in range(epochs): train_loss = self.train_epoch(train_loader) accuracy = test_loss = self.evaluate(test_loader) train_losses.append(train_loss) test_losses.append(test_loss) accuracies.append(accuracy) print(f"Epoch {i+1}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}") if save_exp_every_n_epochs > 0 and (i+1) % save_exp_every_n_epochs == 0 and i+1 != epochs: print('\tSave checkpoint at epoch',i+1) save_checkpoint(self.exp_name, self.model, i+1) save_experiment(self.exp_name, self.model, i+1) def train_epoch(self,train_loader): self.model.train() total_loss = 0 for batch in train_loader: batch = [t.tp(self.device) for t in batch] images, labels = batch self.optimizer.zero_grad() loss = self.loss(self.model(images)[0], labels) loss.backward() self.optimizer.step() total_loss += loss.item()*len(images) return total_loss/ len(train_loader.dataset) @torch.no_grad() def evaluate(self,test_loader): self.model.eval() total_loss = 0 correct = 0 with torch.no_grad(): for batch in test_loader(): batch = [t.to(self.device) for t in batch] images, labels = batch logits,_ = self.model(images) loss = self.loss(logits,labels) total_loss += loss.item() * len(images) predictions = torch.argmax(logits, dim = 1) correct = torch.sum(predictions == labels).item() accuracy = correct/ len(test_loader.dataset) avg_loss = total_loss / len(test_loader.dataset) return accuracy, avg_loss def parse_args(): import argparse parser = argparse.ArgumentParser() parser.add_argument("--exp-name", type = str, required = True) parser.add_argument("--batch-size", type = int, default = 256) parser.add_argument("--epochs", type=int, default=100) parser.add_argument("--lr", type=float, default=1e-2) parser.add_argument("--device", type=str) parser.add_argument("--save-model-every", type=int, default=0) args = parser.parse_args() if args.device is None: args.device = "cuda" if torch.cuda.is_available() else "cpu" return args def main(): args = parse_args() batch_size = args.batch_size epochs = args.epochs lr = args.lr device = args.device save_exp_every_n_epochs = args.save_model_every trainloader, testloader = prepare_data(batch_size = batch_size) model = Classification(config) """ IF YOU WANT TO USE LORA TRAINING, UNCOMMENT THE BELOW LINES def create_model(): model = Classification(config) if config["use_lora"]: model = prepare_model_for_lora_training(model) return model """ optimizer = torch.optim.AdamW(model.parameters(), lr = lr, weight_decay = 1e-2) loss_fn = nn.CrossEntropyLoss() trainer = Trainer(model, optimizer, loss_fn, args.exp_name, device = device) trainer.train(trainloader, testloader, epochs, save_exp_every_n_epochs = save_exp_every_n_epochs) if __name__ == "__main__": main() ================================================ FILE: vit/src/utils.py ================================================ import json, os, math import matplotlib.pyplot as plt import numpy as np import torch from torch.nn import functional as F import torchvision 
================================================
FILE: vit/src/requirements.txt
================================================
python-dateutil==2.8.2
python-multipart==0.0.12
torch==2.4.1
torchaudio==2.4.1
torchvision==0.19.1
numpy==1.26.4

================================================
FILE: vit/src/trainer.py
================================================
import argparse

import torch
import torch.nn as nn

from data import *
from utils import save_checkpoint, save_experiment
from ViT import Classification

config = {
    "patch_size": 4,  # input image size 32x32 -> 8x8 = 64 patches
    "vector_dim": 48,
    "num_hidden_layers": 4,
    "num_attention_heads": 4,
    "hidden_size": 4 * 48,  # 4 * vector_dim
    "hidden_dropout_prob": 0.0,
    "attention_probs_dropout_prob": 0.0,
    "initializer_range": 0.02,
    "image_size": 32,
    "num_classes": 10,  # number of CIFAR-10 classes
    "num_channels": 3,
    "qkv_bias": True,
}

assert config["vector_dim"] % config["num_attention_heads"] == 0
assert config["hidden_size"] == 4 * config["vector_dim"]
assert config["image_size"] % config["patch_size"] == 0


class Trainer:
    """
    Simple trainer block.
    """

    def __init__(self, model, optimizer, loss_fn, exp_name, device):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.exp_name = exp_name
        self.device = device

    def train(self, train_loader, test_loader, epochs, save_exp_every_n_epochs=0):
        train_losses, test_losses, accuracies = [], [], []
        for i in range(epochs):
            train_loss = self.train_epoch(train_loader)
            accuracy, test_loss = self.evaluate(test_loader)
            train_losses.append(train_loss)
            test_losses.append(test_loss)
            accuracies.append(accuracy)
            print(f"Epoch {i+1}, Train loss: {train_loss:.4f}, Test loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}")
            if save_exp_every_n_epochs > 0 and (i + 1) % save_exp_every_n_epochs == 0 and i + 1 != epochs:
                print('\tSave checkpoint at epoch', i + 1)
                save_checkpoint(self.exp_name, self.model, i + 1)
        # save the final model together with the (module-level) config and the metrics
        save_experiment(self.exp_name, config, self.model, train_losses, test_losses, accuracies)

    def train_epoch(self, train_loader):
        self.model.train()
        total_loss = 0
        for batch in train_loader:
            batch = [t.to(self.device) for t in batch]
            images, labels = batch
            self.optimizer.zero_grad()
            loss = self.loss_fn(self.model(images)[0], labels)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item() * len(images)
        return total_loss / len(train_loader.dataset)

    @torch.no_grad()
    def evaluate(self, test_loader):
        self.model.eval()
        total_loss = 0
        correct = 0
        for batch in test_loader:
            batch = [t.to(self.device) for t in batch]
            images, labels = batch
            logits, _ = self.model(images)
            loss = self.loss_fn(logits, labels)
            total_loss += loss.item() * len(images)
            predictions = torch.argmax(logits, dim=1)
            correct += torch.sum(predictions == labels).item()
        accuracy = correct / len(test_loader.dataset)
        avg_loss = total_loss / len(test_loader.dataset)
        return accuracy, avg_loss


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp-name", type=str, required=True)
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--lr", type=float, default=1e-2)
    parser.add_argument("--device", type=str)
    parser.add_argument("--save-model-every", type=int, default=0)
    args = parser.parse_args()
    if args.device is None:
        args.device = "cuda" if torch.cuda.is_available() else "cpu"
    return args


def main():
    args = parse_args()
    batch_size = args.batch_size
    epochs = args.epochs
    lr = args.lr
    device = args.device
    save_exp_every_n_epochs = args.save_model_every

    trainloader, testloader, _ = prepare_data(batch_size=batch_size)
    model = Classification(config)
    # IF YOU WANT TO USE LORA TRAINING, UNCOMMENT THE BELOW LINES
    # (uses the Classification class from vit_with_lora.py and requires
    #  "use_lora": True in the config):
    #
    # from vit_with_lora import Classification, prepare_model_for_lora_training
    # model = Classification(config)
    # if config.get("use_lora", False):
    #     model = prepare_model_for_lora_training(model)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)
    loss_fn = nn.CrossEntropyLoss()
    trainer = Trainer(model, optimizer, loss_fn, args.exp_name, device=device)
    trainer.train(trainloader, testloader, epochs, save_exp_every_n_epochs=save_exp_every_n_epochs)


if __name__ == "__main__":
    main()
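A sketch of what the LoRA path could look like, assuming the `config.get` defaults in `vit_with_lora.py` (`use_lora`, `lora_rank`, `lora_alpha`); the learning rate is an arbitrary placeholder:

```python
# Hypothetical LoRA fine-tuning setup, mirroring the commented hint in main().
import torch
from trainer import config
from vit_with_lora import Classification, prepare_model_for_lora_training

lora_config = {**config, "use_lora": True, "lora_rank": 8, "lora_alpha": 16}
model = prepare_model_for_lora_training(Classification(lora_config))
# only LoRA parameters still require grad, so only they reach the optimizer
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=1e-3)
```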
""" def __init__(self, config): super().__init__() self.image_size = config["image_size"] self.patch_size = config["patch_size"] self.num_channels = config["num_channels"] self.vector_dim = config["vector_dim"] self.num_patches = (self.image_size // self.patch_size) ** 2 self.projection = nn.Conv2d(self.num_channels, self.vector_dim, kernel_size=self.patch_size, stride=self.patch_size) def forward(self, x): # {batch_size, num_channels, image_size, image_size}-> {batch_size, num_patches, vector_dim} x = self.projection(x) x = x.flatten(2).transpose(1, 2) return x class Embeddings(nn.Module): """ adding positional information to extracted patch embeddings """ def __init__(self,config): self.config = config self.patch_emb = PatchEmbeddings(config) self.cls_token = nn.Parameter(torch.randn(1,1,config["vector_dim"])) self.positional_encoding = nn.Parameter(torch.randn(1,self.patch_emb.num_patches + 1, config["vector_dim"])) self.droput = nn.Dropout(config["droput_prob"]) def forward(self,x): x = self.patch_emb(x) batch_size, _, _ = x.size() # expand the [cls] token to batch size #{1,1,vector_dim} -> (batch_size,1,hidden_size) cls_tokens = self.cls_token.expand(batch_size,-1,-1) """ concatenating cls token to inputn sequence size : {num_patches + 1} """ x = torch.cat((cls_tokens,x),dim = 1) x = x + self.positional_encoding return x class Attention(nn.Module): """ Attention module with LoRA Support """ def __init__(self,vector_dim,attention_head_size,dropout,bias=True, use_lora=False, lora_rank=8, lora_alpha=16): super().__init__() self.vector_dim = vector_dim self.attention_head_size = attention_head_size self.dropout = nn.Dropout(dropout) self.use_lora = use_lora self.query = nn.Linear(vector_dim, attention_head_size, bias = bias) self.key = nn.Linear(vector_dim, attention_head_size, bias = bias) self.value = nn.Linear(vector_dim, attention_head_size, bias = bias) if use_lora: self.lora_q = LoRALayer(vector_dim, attention_head_size, lora_rank, lora_alpha) self.lora_v = LoRALayer(vector_dim, attention_head_size, lora_rank, lora_alpha) def forward(self, x): q = self.query(x) key = self.key(x) v = self.value(x) if self.use_lora: query = q + self.lora_q(x) value = v + self.lora_v(x) similarity = torch.matmul(query, key.transpose(-1,-2)) attention_probs = F.softmax((similarity/math.sqrt(self.attention_head_size)),dim = 1) attention_probs = self.dropout(attention_probs) output = torch.matmul(attention_probs, value) return output, attention_probs class MultiheadAttention(nn.Module): """ Multi-headed-attention module with LoRA support """ def __init__(self, config): super().__init__() self.vector_dim = config["vector_dim"] self.num_attention_heads = config["num_attention_heads"] self.attention_head_size = self.vector_dim // self.num_attention_heads self.all_head_size = self.num_attention_heads * self.attention_head_size self.qkv_bias = config["qkv_bias"] self.use_lora = config.get("use_lora", False) self.lora_rank = config.get("lora_rank", 8) self.lora_alpha = config.get("lora_alpha", 16) self.heads = nn.ModuleList([ Attention( self.vector_dim, self.attention_head_size, config["attention_probs_dropout_prob"], self.qkv_bias, self.use_lora, self.lora_rank, self.lora_alpha ) for _ in range(self.num_attention_heads) ]) self.output_projection = nn.Linear(self.all_head_size, self.vector_dim) self.output_dropout = nn.Dropout(config["hidden_dropout_prob"]) def forward(self, x, output_attentions=False): attention_outputs = [head(x) for head in self.heads] attention_output = torch.cat( [attention_output for 
================================================
FILE: vit/src/vit_with_lora.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

from base import *


class LoRALayer(nn.Module):
    """Low-Rank Adaptation layer: a trainable rank-`rank` update A @ B, scaled by alpha / rank."""

    def __init__(self, in_features, out_features, rank=4, alpha=16):
        super(LoRALayer, self).__init__()
        self.rank = rank
        self.scaling = alpha / rank
        self.lora_A = nn.Parameter(torch.zeros(in_features, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))
        # A gets a random init, B stays zero, so the LoRA update starts at zero
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        return x @ (self.lora_A @ self.lora_B) * self.scaling


class PatchEmbeddings(nn.Module):
    """
    Convert the image into patches and then project them into a vector space.
    """

    def __init__(self, config):
        super().__init__()
        self.image_size = config["image_size"]
        self.patch_size = config["patch_size"]
        self.num_channels = config["num_channels"]
        self.vector_dim = config["vector_dim"]
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.projection = nn.Conv2d(self.num_channels, self.vector_dim,
                                    kernel_size=self.patch_size, stride=self.patch_size)

    def forward(self, x):
        # (batch_size, num_channels, image_size, image_size) -> (batch_size, num_patches, vector_dim)
        x = self.projection(x)
        x = x.flatten(2).transpose(1, 2)
        return x


class Embeddings(nn.Module):
    """
    Add a learnable [CLS] token and positional information to the extracted patch embeddings.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.patch_emb = PatchEmbeddings(config)
        self.cls_token = nn.Parameter(torch.randn(1, 1, config["vector_dim"]))
        self.positional_encoding = nn.Parameter(
            torch.randn(1, self.patch_emb.num_patches + 1, config["vector_dim"]))
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x):
        x = self.patch_emb(x)
        batch_size, _, _ = x.size()
        # expand the [CLS] token to the batch: (1, 1, vector_dim) -> (batch_size, 1, vector_dim)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        # concatenate the [CLS] token to the input; sequence length becomes num_patches + 1
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.positional_encoding
        x = self.dropout(x)
        return x


class Attention(nn.Module):
    """
    Attention head with optional LoRA adapters on the query and value projections.
    """

    def __init__(self, vector_dim, attention_head_size, dropout, bias=True,
                 use_lora=False, lora_rank=8, lora_alpha=16):
        super().__init__()
        self.vector_dim = vector_dim
        self.attention_head_size = attention_head_size
        self.dropout = nn.Dropout(dropout)
        self.use_lora = use_lora
        self.query = nn.Linear(vector_dim, attention_head_size, bias=bias)
        self.key = nn.Linear(vector_dim, attention_head_size, bias=bias)
        self.value = nn.Linear(vector_dim, attention_head_size, bias=bias)
        if use_lora:
            self.lora_q = LoRALayer(vector_dim, attention_head_size, lora_rank, lora_alpha)
            self.lora_v = LoRALayer(vector_dim, attention_head_size, lora_rank, lora_alpha)

    def forward(self, x):
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        if self.use_lora:
            # LoRA adds a low-rank update on top of the (frozen) base projections
            query = query + self.lora_q(x)
            value = value + self.lora_v(x)
        similarity = torch.matmul(query, key.transpose(-1, -2))
        attention_probs = F.softmax(similarity / math.sqrt(self.attention_head_size), dim=-1)
        attention_probs = self.dropout(attention_probs)
        output = torch.matmul(attention_probs, value)
        return (output, attention_probs)


class MultiheadAttention(nn.Module):
    """
    Multi-head attention module with LoRA support.
    """

    def __init__(self, config):
        super().__init__()
        self.vector_dim = config["vector_dim"]
        self.num_attention_heads = config["num_attention_heads"]
        self.attention_head_size = self.vector_dim // self.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.qkv_bias = config["qkv_bias"]
        self.use_lora = config.get("use_lora", False)
        self.lora_rank = config.get("lora_rank", 8)
        self.lora_alpha = config.get("lora_alpha", 16)
        self.heads = nn.ModuleList([
            Attention(self.vector_dim,
                      self.attention_head_size,
                      config["attention_probs_dropout_prob"],
                      self.qkv_bias,
                      self.use_lora,
                      self.lora_rank,
                      self.lora_alpha)
            for _ in range(self.num_attention_heads)
        ])
        self.output_projection = nn.Linear(self.all_head_size, self.vector_dim)
        self.output_dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x, output_attentions=False):
        attention_outputs = [head(x) for head in self.heads]
        attention_output = torch.cat(
            [attention_output for attention_output, _ in attention_outputs], dim=-1)
        attention_output = self.output_projection(attention_output)
        attention_output = self.output_dropout(attention_output)
        if not output_attentions:
            return (attention_output, None)
        attention_probs = torch.stack(
            [attention_probs for _, attention_probs in attention_outputs], dim=1)
        return (attention_output, attention_probs)


class MLP(nn.Module):
    """
    Multi-layer perceptron module with LoRA support.
    """

    def __init__(self, config):
        super().__init__()
        self.use_lora = config.get("use_lora", False)
        self.lora_rank = config.get("lora_rank", 8)
        self.lora_alpha = config.get("lora_alpha", 16)
        self.dense_1 = nn.Linear(config["vector_dim"], config["hidden_size"])
        self.dense_2 = nn.Linear(config["hidden_size"], config["vector_dim"])
        if self.use_lora:
            self.lora_1 = LoRALayer(config["vector_dim"], config["hidden_size"],
                                    self.lora_rank, self.lora_alpha)
            self.lora_2 = LoRALayer(config["hidden_size"], config["vector_dim"],
                                    self.lora_rank, self.lora_alpha)
        self.act = NewGELUActivation()
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

    def forward(self, x):
        hidden = self.dense_1(x)
        if self.use_lora:
            hidden = hidden + self.lora_1(x)
        hidden = self.act(hidden)
        output = self.dense_2(hidden)
        if self.use_lora:
            output = output + self.lora_2(hidden)
        output = self.dropout(output)
        return output


def prepare_mlp_for_lora_training(model):
    """Freeze all parameters except LoRA parameters."""
    for name, param in model.named_parameters():
        param.requires_grad = 'lora' in name
    return model


class Block(nn.Module):
    """Single transformer block with LoRA support."""

    def __init__(self, config):
        super().__init__()
        self.attention = MultiheadAttention(config)
        self.layer_norm1 = nn.LayerNorm(config["vector_dim"])
        self.mlp = MLP(config)
        self.layer_norm2 = nn.LayerNorm(config["vector_dim"])

    def forward(self, x, output_attentions=False):
        attention_output, attention_probs = self.attention(
            self.layer_norm1(x), output_attentions=output_attentions)
        x = x + attention_output
        mlp_output = self.mlp(self.layer_norm2(x))
        x = x + mlp_output
        if not output_attentions:
            return (x, None)
        return (x, attention_probs)


class Encoder(nn.Module):
    """
    Transformer encoder with LoRA support.
    """

    def __init__(self, config):
        super().__init__()
        self.blocks = nn.ModuleList([Block(config) for _ in range(config["num_hidden_layers"])])

    def forward(self, x, output_attentions=False):
        all_attentions = []
        for block in self.blocks:
            x, attention_probs = block(x, output_attentions=output_attentions)
            if output_attentions:
                all_attentions.append(attention_probs)
        if not output_attentions:
            return (x, None)
        return (x, all_attentions)


class LoRALinear(nn.Module):
    """
    Linear layer with a LoRA adapter, used for the classification head.
    """

    def __init__(self, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.lora = LoRALayer(in_features, out_features, rank, alpha)

    def forward(self, x):
        return self.linear(x) + self.lora(x)


class Classification(nn.Module):
    """
    ViT model for classification with LoRA support.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.image_size = config["image_size"]
        self.vector_dim = config["vector_dim"]
        self.num_classes = config["num_classes"]
        # Initialize components
        self.embeddings = Embeddings(config)
        self.encoder = Encoder(config)
        # Use LoRA for the classifier if enabled
        if config.get("use_lora", False):
            self.classifier = LoRALinear(
                self.vector_dim,
                self.num_classes,
                config.get("lora_rank", 8),
                config.get("lora_alpha", 16)
            )
        else:
            self.classifier = nn.Linear(self.vector_dim, self.num_classes)
        self.apply(self._init_weights)

    def forward(self, x, output_attentions=False):
        embedding_output = self.embeddings(x)
        encoder_output, all_attentions = self.encoder(
            embedding_output, output_attentions=output_attentions)
        # Use the [CLS] token for classification
        logits = self.classifier(encoder_output[:, 0, :])
        if not output_attentions:
            return (logits, None)
        return (logits, all_attentions)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.config["initializer_range"])
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Embeddings):
            module.positional_encoding.data = nn.init.trunc_normal_(
                module.positional_encoding.data.to(torch.float32),
                mean=0.0,
                std=self.config["initializer_range"],
            ).to(module.positional_encoding.dtype)


def prepare_model_for_lora_training(model):
    """
    Prepare the model for LoRA training by freezing all non-LoRA parameters.
    """
    for name, param in model.named_parameters():
        param.requires_grad = 'lora' in name
    return model
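One property worth noting: because `lora_B` is initialized to zero, a fresh adapter contributes nothing until it is trained, so wrapping a frozen layer leaves the initial model function unchanged. A quick illustrative check (not in the repo):

```python
# Illustrative: a fresh LoRALayer is a no-op, and its parameter count is small.
import torch
from vit_with_lora import LoRALayer

lora = LoRALayer(in_features=48, out_features=48, rank=8, alpha=16)
x = torch.randn(2, 65, 48)
print(torch.allclose(lora(x), torch.zeros_like(x)))  # True: lora_B starts at zero

print(sum(p.numel() for p in lora.parameters()))     # 48*8 + 8*48 = 768
```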
config.get("lora_rank", 8), config.get("lora_alpha", 16) ) else: self.classifier = nn.Linear(self.vector_dim, self.num_classes) self.apply(self._init_weights) def forward(self, x, output_attentions=False): embedding_output = self.embeddings(x) encoder_output, all_attentions = self.encoder( embedding_output, output_attentions=output_attentions ) # Use CLS token for classification logits = self.classifier(encoder_output[:, 0, :]) if not output_attentions: return (logits, None) else: return (logits, all_attentions) def _init_weights(self, module): if isinstance(module, (nn.Linear, nn.Conv2d)): torch.nn.init.normal_( module.weight, mean=0.0, std=self.config["initializer_range"] ) if module.bias is not None: torch.nn.init.zeros_(module.bias) elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, Embeddings): module.position_embeddings.data = nn.init.trunc_normal_( module.position_embeddings.data.to(torch.float32), mean=0.0, std=self.config["initializer_range"], ).to(module.position_embeddings.dtype) def prepare_model_for_lora_training(model): """ Prepare the model for LoRA training by freezing non-LoRA parameters """ for name, param in model.named_parameters(): if 'lora' not in name: param.requires_grad = False else: param.requires_grad = True return model ================================================ FILE: vit/visualize/vis.py ================================================ import torch import torch.nn as nn import matplotlib.pyplot as plt import torchvision.transforms as T from torchvision.utils import make_grid class PatchEmbedding(nn.Module): def __init__(self, num_patches, vector_dim, patch_size): super(PatchEmbedding, self).__init__() self.conv = nn.Conv2d(3, vector_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): x = self.conv(x) return x input_image = torch.randn(1, 3, 224, 224) # {creating a dummy image to vis} vector_dim = 256 patch_size = 16 patch_embedding = PatchEmbedding(3, vector_dim, patch_size) output = patch_embedding(input_image) def visualize_patches(input_image, patch_size): """ visualizing patches and attention maps """ input_image = input_image.squeeze(0).permute(1, 2, 0).numpy() fig, ax = plt.subplots() ax.imshow(input_image) for i in range(0, input_image.shape[0], patch_size): ax.axhline(i, color='red') for j in range(0, input_image.shape[1], patch_size): ax.axvline(j, color='red') plt.title("Input Image with Patches") plt.show() visualize_patches(input_image, patch_size) def visualize_feature_maps(feature_maps, num_maps_to_show=8): maps_to_show = feature_maps[0, :num_maps_to_show, :, :] grid = make_grid(maps_to_show.unsqueeze(1), nrow=4, normalize=True, scale_each=True) plt.figure(figsize=(15, 15)) plt.imshow(grid.permute(1, 2, 0).cpu().numpy()) plt.title("Feature Maps") plt.axis('off') plt.show() visualize_feature_maps(output, num_maps_to_show=8)