Repository: Lednik7/CLIP-ONNX Branch: main Commit: ebd4852b7d3e Files: 19 Total size: 284.9 KB Directory structure: gitextract_k9qaok4o/ ├── .gitignore ├── LICENSE ├── README.md ├── benchmark.md ├── clip_onnx/ │ ├── __init__.py │ ├── benchmark.py │ ├── clip_converter.py │ ├── clip_onnx.py │ └── utils.py ├── examples/ │ ├── RuCLIP_onnx_example.ipynb │ ├── clip_onnx_example.ipynb │ ├── dev/ │ │ ├── clip_onnx_benchmark_cpu.ipynb │ │ ├── clip_onnx_benchmark_gpu.ipynb │ │ ├── clip_onnx_benchmark_gpu_K80.ipynb │ │ └── clip_onnx_benchmark_gpu_T4.ipynb │ ├── readme_example.ipynb │ └── ru_CLIP_tiny_onnx.ipynb ├── requirements.txt └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Gerasimov Maxim Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # CLIP-ONNX It is a simple library to speed up CLIP inference up to 3x (K80 GPU)! [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/readme_example.ipynb) Open AI CLIP [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/RuCLIP_onnx_example.ipynb) RuCLIP Example [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/ru_CLIP_tiny_onnx.ipynb) RuCLIP tiny Example ## Usage Install clip-onnx module and requirements first. Use this trick ```python3 !pip install git+https://github.com/Lednik7/CLIP-ONNX.git !pip install git+https://github.com/openai/CLIP.git !pip install onnxruntime-gpu ``` ## Example in 3 steps 0. Download CLIP image from repo ```python3 !wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true ``` 1. Load standard CLIP model, image, text on cpu ```python3 import clip from PIL import Image import numpy as np # onnx cannot work with cuda model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False) # batch first image = preprocess(Image.open("CLIP.png")).unsqueeze(0).cpu() # [1, 3, 224, 224] image_onnx = image.detach().cpu().numpy().astype(np.float32) # batch first text = clip.tokenize(["a diagram", "a dog", "a cat"]).cpu() # [3, 77] text_onnx = text.detach().cpu().numpy().astype(np.int32) ``` 2. 
Create CLIP-ONNX object to convert model to onnx ```python3 from clip_onnx import clip_onnx visual_path = "clip_visual.onnx" textual_path = "clip_textual.onnx" onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path) onnx_model.convert2onnx(image, text, verbose=True) # ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'] onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # cpu mode ``` 3. Use for standard CLIP API. Batch inference ```python3 image_features = onnx_model.encode_image(image_onnx) text_features = onnx_model.encode_text(text_onnx) logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx) probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy() print("Label probs:", probs) # prints: [[0.9927937 0.00421067 0.00299571]] ``` **Enjoy the speed** ## Load saved model Example for ViT-B/32 from Model Zoo ```python3 !wget https://clip-as-service.s3.us-east-2.amazonaws.com/models/onnx/ViT-B-32/visual.onnx !wget https://clip-as-service.s3.us-east-2.amazonaws.com/models/onnx/ViT-B-32/textual.onnx ``` ```python3 onnx_model = clip_onnx(None) onnx_model.load_onnx(visual_path="visual.onnx", textual_path="textual.onnx", logit_scale=100.0000) # model.logit_scale.exp() onnx_model.start_sessions(providers=["CPUExecutionProvider"]) ``` ## Model Zoo Models of the original CLIP can be found on this [page](https://github.com/jina-ai/clip-as-service/blob/main/server/clip_server/model/clip_onnx.py).\ They are not part of this library but should work correctly. ## If something doesn't work It happens that onnx does not convert the model the first time, in these cases it is worth trying to run it again. If it doesn't help, it makes sense to change the export settings. 
Model export options in onnx looks like this: ```python3 DEFAULT_EXPORT = dict(input_names=['input'], output_names=['output'], export_params=True, verbose=False, opset_version=12, do_constant_folding=True, dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}) ``` You can change them pretty easily. ```python3 from clip_onnx.utils import DEFAULT_EXPORT DEFAULT_EXPORT["opset_version"] = 15 ``` Alternative option (change only visual or textual): ```python3 from clip_onnx import clip_onnx from clip_onnx.utils import DEFAULT_EXPORT visual_path = "clip_visual.onnx" textual_path = "clip_textual.onnx" textual_export_params = DEFAULT_EXPORT.copy() textual_export_params["dynamic_axes"] = {'input': {1: 'batch_size'}, 'output': {0: 'batch_size'}} textual_export_params["opset_version"] = 12 Textual = lambda x: x onnx_model = clip_onnx(model.cpu(), visual_path=visual_path, textual_path=textual_path) onnx_model.convert2onnx(dummy_input_image, dummy_input_text, verbose=True, textual_wrapper=Textual, textual_export_params=textual_export_params) ``` ## Best practices See [benchmark.md](https://github.com/Lednik7/CLIP-ONNX/tree/main/benchmark.md) ## Examples See [examples folder](https://github.com/Lednik7/CLIP-ONNX/tree/main/examples) for more details \ Some parts of the code were taken from the [post](https://twitter.com/apeoffire/status/1478493291008172038). Thank you [neverix](https://github.com/neverix) for this notebook. 
================================================ FILE: benchmark.md ================================================ # CPU benchmarks #### Run on Intel (R) Xeon (R) CPU @ 2.30 GHz with 2 cores (Google Colab session) | ONNX | batch | encode_image | encode_text | total | |:---------|--------:|---------------:|--------------:|--------:| | ViT-B/32 | 2 | 0.234 | 0.162 | 0.396 | | ViT-B/32 | 8 | 0.923 | 0.656 | 1.579 | | ViT-B/32 | 16 | 2.079 | 1.288 | 3.367 | | ViT-B/32 | 32 | 3.937 | 2.658 | 6.595 | | ViT-B/32 | 64 | 7.944 | 5.567 | 13.511 | | TORCH | batch | encode_image | encode_text | total | |:---------|--------:|---------------:|--------------:|--------:| | ViT-B/32 | 2 | 0.343 | 0.243 | 0.586 | | ViT-B/32 | 8 | 1.093 | 0.831 | 1.924 | | ViT-B/32 | 16 | 1.952 | 1.523 | 3.475 | | ViT-B/32 | 32 | 4.079 | 3.015 | 7.094 | | ViT-B/32 | 64 | 8.07 | 6.212 | 14.282 | # GPU benchmarks #### Run on NVIDIA Tesla K80 (Google Colab session) | ONNX | batch | encode_image | encode_text | total | |:---------|--------:|---------------:|--------------:|--------:| | ViT-B/32 | 2 | 0.136 | 0.021 | 0.157 | | ViT-B/32 | 8 | 0.054 | 0.04 | 0.094 | | ViT-B/32 | 16 | 0.089 | 0.071 | 0.16 | | ViT-B/32 | 32 | 0.158 | 0.134 | 0.292 | | ViT-B/32 | 64 | 0.325 | 0.258 | 0.583 | | TORCH | batch | encode_image | encode_text | total | |:---------|--------:|---------------:|--------------:|--------:| | ViT-B/32 | 2 | 0.02 | 0.035 | 0.055 | | ViT-B/32 | 8 | 0.081 | 0.098 | 0.179 | | ViT-B/32 | 16 | 0.207 | 0.196 | 0.403 | | ViT-B/32 | 32 | 0.44 | 0.374 | 0.814 | | ViT-B/32 | 64 | 0.919 | 0.719 | 1.638 | #### Run on NVIDIA Tesla T4 (Google Colab session) | ONNX | batch | encode_image | encode_text | total | |:---------|--------:|---------------:|--------------:|--------:| | ViT-B/32 | 2 | 0.155 | 0.01 | 0.165 | | ViT-B/32 | 8 | 0.032 | 0.014 | 0.046 | | ViT-B/32 | 16 | 0.037 | 0.029 | 0.066 | | ViT-B/32 | 32 | 0.076 | 0.059 | 0.135 | | ViT-B/32 | 64 | 0.169 | 0.117 | 0.286 | | TORCH | batch | 
# (benchmark.md, continued — NVIDIA Tesla T4, TORCH results)
# encode_image | encode_text | total |
# |:---------|--------:|---------------:|--------------:|--------:|
# | ViT-B/32 |  2 | 0.017 | 0.009 | 0.026 |
# | ViT-B/32 |  8 | 0.008 | 0.008 | 0.016 |
# | ViT-B/32 | 16 | 0.009 | 0.012 | 0.021 |
# | ViT-B/32 | 32 | 0.008 | 0.025 | 0.033 |
# | ViT-B/32 | 64 | 0.009 | 0.049 | 0.058 |
#
# ============================================================================
# FILE: clip_onnx/__init__.py
# ----------------------------------------------------------------------------
# from .clip_converter import clip_converter
# from .clip_onnx import clip_onnx
# from .utils import Textual, attention
# from .benchmark import speed_test
# ============================================================================
# clip_onnx package sources (consolidated listing). utils.py is listed first
# because clip_converter.py uses Textual / DEFAULT_EXPORT as argument
# defaults, which must already exist at class-definition time in a flat
# module. `onnx` / `onnxruntime` imports are deferred to the methods that
# need them, so the module imports on a torch-only environment.

import time

import torch
import torch.nn.functional as F
from torch import nn

# ============================================================================
# FILE: clip_onnx/utils.py
# ----------------------------------------------------------------------------


class Textual(nn.Module):
    """Standalone text tower of a CLIP model, suitable for ONNX export.

    Wraps exactly the submodules of ``model`` needed to embed tokenized
    text: token embedding -> positional embedding -> transformer ->
    final LayerNorm -> text projection.
    """

    def __init__(self, model):
        super().__init__()
        # FIX: the original assigned ``self.transformer`` twice; once is enough.
        self.transformer = model.transformer
        self.positional_embedding = model.positional_embedding
        self.ln_final = model.ln_final
        self.text_projection = model.text_projection
        self.token_embedding = model.token_embedding

    def forward(self, text):
        x = self.token_embedding(text)  # [batch_size, n_ctx, d_model]
        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)
        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest
        # number in each sequence); needs .float() before .argmax() to work
        x = x[torch.arange(x.shape[0]), text.float().argmax(dim=-1)] @ self.text_projection
        return x


def attention(self, x: torch.Tensor):
    """ONNX-friendly replacement for ``ResidualAttentionBlock.attention``.

    Intended to be monkey-patched onto ``clip.model.ResidualAttentionBlock``
    (see the example notebooks): the ONNX exporter does not handle
    ``multi_head_attention_forward`` well, so this reimplements it with
    plain einsum / reshape / linear ops.
    """
    self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
    # One fused in-projection, then split into query/key/value.
    q, k, v = (torch.einsum("tbh, oh -> tbo", x, self.attn.in_proj_weight)
               + self.attn.in_proj_bias).contiguous().chunk(3, dim=-1)
    tgt_len = q.shape[0]
    bsz = q.shape[1]
    num_heads = self.attn.num_heads
    head_dim = q.shape[2] // num_heads
    attn_output = scaled_dot_product_attention(
        q.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1),
        k.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1),
        v.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1),
        self.attn_mask, 0.0
    )
    attn_output = attn_output.transpose(0, 1).contiguous().view(q.shape)
    attn_output = F.linear(attn_output, self.attn.out_proj.weight, self.attn.out_proj.bias)
    return attn_output


def scaled_dot_product_attention(Q, K, V, attn_mask, dropout_p):
    """Plain ``softmax(Q @ K^T / sqrt(d) [+ mask]) @ V`` attention.

    ``dropout_p`` is accepted for signature compatibility but unused:
    CLIP always runs this with dropout 0.0.
    """
    if attn_mask is None:
        attn_weight = torch.softmax(Q @ K.transpose(-2, -1) / Q.size(-1) ** 0.5, dim=-1)
    else:
        attn_weight = torch.softmax(Q @ K.transpose(-2, -1) / Q.size(-1) ** 0.5 + attn_mask[None, ...], dim=-1)
    # attn_weight = torch.dropout(attn_weight, dropout_p)  # always 0.0 in CLIP
    return attn_weight @ V


# Default settings handed to torch.onnx.export(); mutate (or copy and
# override) to customize the export.
DEFAULT_EXPORT = dict(input_names=['input'],
                      output_names=['output'],
                      export_params=True,
                      verbose=False,
                      opset_version=12,
                      do_constant_folding=True,
                      dynamic_axes={'input': {0: 'batch_size'},
                                    'output': {0: 'batch_size'}})

# ============================================================================
# FILE: clip_onnx/benchmark.py
# ----------------------------------------------------------------------------


def speed_test(func, data_gen, n: int = 5, empty_cache: bool = True):
    """Return the mean wall-clock time (seconds) of ``func`` over ``n`` runs.

    Args:
        func: callable taking one argument (the generated input).
        data_gen: zero-argument factory producing a fresh input per run;
            generation time is NOT included in the measurement.
        n: number of timed runs to average over.
        empty_cache: if True, release cached CUDA memory before and after
            the benchmark (a no-op when CUDA is not initialized).
    """
    if empty_cache:
        torch.cuda.empty_cache()
    timings = []
    for _ in range(n):
        sample = data_gen()  # built outside the timed region
        start = time.time()
        func(sample)
        timings.append(time.time() - start)
    if empty_cache:
        torch.cuda.empty_cache()
    return sum(timings) / n

# ============================================================================
# FILE: clip_onnx/clip_converter.py
# ----------------------------------------------------------------------------


class clip_converter(nn.Module):
    """Exports a (CPU) CLIP model's visual and textual towers to ONNX files."""

    def __init__(self, model,
                 visual_path: str = "clip_visual.onnx",
                 textual_path: str = "clip_textual.onnx"):
        super().__init__()
        self.model = model
        self.visual_path = visual_path
        self.textual_path = textual_path
        self.visual_flag = False   # True once the visual tower is exported
        self.textual_flag = False  # True once the textual tower is exported
        self.logit_scale = self.model.logit_scale.exp()
        # Export-only usage: switch to eval mode and freeze all weights.
        self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False

    def quantization(self, mode: str = "dynamic"):
        """Quantize both exported ONNX files (dynamic uint8).

        Writes ``<path>.quant`` next to each original file and repoints
        ``self.visual_path`` / ``self.textual_path`` at the quantized copies.
        """
        # Deferred import: onnxruntime is only required for quantization.
        from onnxruntime.quantization import QuantType, quantize_dynamic
        assert mode in ["dynamic"]
        if mode == "dynamic":
            model_quant_visual = f"{self.visual_path}.quant"
            quantize_dynamic(self.visual_path, model_quant_visual,
                             weight_type=QuantType.QUInt8)
            self.visual_path = model_quant_visual

            model_quant_textual = f"{self.textual_path}.quant"
            quantize_dynamic(self.textual_path, model_quant_textual,
                             weight_type=QuantType.QUInt8)
            self.textual_path = model_quant_textual

    def torch_export(self, model, dummy_input, path: str,
                     export_params=DEFAULT_EXPORT):
        """Trace ``model`` with ``dummy_input`` and write ONNX to ``path``."""
        torch.onnx.export(model, dummy_input, path, **export_params)

    def onnx_checker(self, path: str):
        """Load the file at ``path`` and run the ONNX structural checker."""
        import onnx  # deferred: only needed for validation
        model = onnx.load(path)
        onnx.checker.check_model(model)
        del model

    def convert_visual(self, dummy_input, wrapper=lambda x: x,
                       export_params=DEFAULT_EXPORT):
        """Export (and validate) the visual tower to ``self.visual_path``."""
        visual = wrapper(self.model.visual)
        self.torch_export(visual, dummy_input, self.visual_path,
                          export_params=export_params)
        self.onnx_checker(self.visual_path)

    def convert_textual(self, dummy_input, wrapper=Textual,
                        export_params=DEFAULT_EXPORT):
        """Export (and validate) the textual tower to ``self.textual_path``."""
        textual = wrapper(self.model)
        self.torch_export(textual, dummy_input, self.textual_path,
                          export_params=export_params)
        self.onnx_checker(self.textual_path)

    def convert2onnx(self, visual_input=None, textual_input=None, verbose=True,
                     visual_wrapper=lambda x: x, textual_wrapper=Textual,
                     visual_export_params=DEFAULT_EXPORT,
                     textual_export_params=DEFAULT_EXPORT):
        """Export whichever towers received a dummy tensor input.

        At least one of ``visual_input`` / ``textual_input`` must be a
        ``torch.Tensor``; each exported file is checked with the ONNX
        validator as part of its ``convert_*`` call.
        """
        has_visual_input = isinstance(visual_input, torch.Tensor)
        has_textual_input = isinstance(textual_input, torch.Tensor)
        if (not has_visual_input) and (not has_textual_input):
            raise Exception("[CLIP ONNX] Please, choose a dummy input")
        elif not has_visual_input:
            print("[CLIP ONNX] Convert only textual model")
        elif not has_textual_input:
            print("[CLIP ONNX] Convert only visual model")

        if has_visual_input:
            self.visual_flag = True
            if verbose:
                print("[CLIP ONNX] Start convert visual model")
            # convert_visual() already runs onnx_checker on the result;
            # the original re-checked the file here a second time (a
            # redundant full model load), which this version drops.
            self.convert_visual(visual_input, visual_wrapper,
                                visual_export_params)
            if verbose:
                print("[CLIP ONNX] Start check visual model")

        if has_textual_input:
            self.textual_flag = True
            if verbose:
                print("[CLIP ONNX] Start convert textual model")
            # Same as above: validation happens inside convert_textual().
            self.convert_textual(textual_input, textual_wrapper,
                                 textual_export_params)
            if verbose:
                print("[CLIP ONNX] Start check textual model")

        if verbose:
            print("[CLIP ONNX] Models converted successfully")

# ============================================================================
# FILE: clip_onnx/clip_onnx.py
# ----------------------------------------------------------------------------


class clip_onnx(clip_converter):
    """ONNX runtime wrapper exposing a CLIP-like encode/__call__ API.

    Two construction modes:
      * ``clip_onnx(model, ...)`` — wraps a torch CLIP model for conversion;
      * ``clip_onnx(None)`` — "load mode": sessions are later created from
        previously exported files via :meth:`load_onnx`.
    """

    def __init__(self, model=None,
                 visual_path: str = "clip_visual.onnx",
                 textual_path: str = "clip_textual.onnx"):
        if model is not None:
            super().__init__(model, visual_path, textual_path)
        else:
            print("[CLIP ONNX] Load mode")

    def load_onnx(self, visual_path=None, textual_path=None, logit_scale=None):
        """Point the wrapper at pre-exported ONNX files.

        ``logit_scale`` (e.g. ``model.logit_scale.exp()``) is required when
        both towers are supplied, because ``__call__`` needs it to scale the
        cosine-similarity logits.
        """
        if visual_path and textual_path:
            if not logit_scale:
                raise Exception("For this mode logit_scale must be specified. Example: model.logit_scale.exp()")
            self.logit_scale = logit_scale
        if visual_path:
            self.visual_path = visual_path
            self.visual_flag = True
        if textual_path:
            self.textual_path = textual_path
            self.textual_flag = True

    def start_sessions(self,
                       providers=['TensorrtExecutionProvider',
                                  'CUDAExecutionProvider',
                                  'CPUExecutionProvider']):
        """Create onnxruntime sessions for whichever towers are configured."""
        import onnxruntime  # deferred: optional runtime dependency
        if self.visual_flag:
            self.visual_session = onnxruntime.InferenceSession(
                self.visual_path, providers=providers)
        if self.textual_flag:
            self.textual_session = onnxruntime.InferenceSession(
                self.textual_path, providers=providers)

    def visual_run(self, onnx_image):
        """Run the visual session; returns raw numpy image features."""
        onnx_input_image = {self.visual_session.get_inputs()[0].name: onnx_image}
        visual_output, = self.visual_session.run(None, onnx_input_image)
        return visual_output

    def textual_run(self, onnx_text):
        """Run the textual session; returns raw numpy text features."""
        onnx_input_text = {self.textual_session.get_inputs()[0].name: onnx_text}
        textual_output, = self.textual_session.run(None, onnx_input_text)
        return textual_output

    def __call__(self, image, text, device: str = "cpu"):
        """CLIP-style forward pass.

        Returns ``(logits_per_image, logits_per_text)`` as torch tensors on
        ``device``; requires both sessions to be configured.
        """
        assert self.visual_flag and self.textual_flag
        image_features = torch.from_numpy(self.visual_run(image)).to(device)
        text_features = torch.from_numpy(self.textual_run(text)).to(device)

        # normalized features
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_image = self.logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text

    def encode_image(self, image):
        """Numpy-in / numpy-out image embedding."""
        return self.visual_run(image)

    def encode_text(self, text):
        """Numpy-in / numpy-out text embedding."""
        return self.textual_run(text)

# ============================================================================
# FILE: examples/RuCLIP_onnx_example.ipynb
# (Jupyter notebook JSON; its content continues on the following lines of
#  this dump.)
torch.__version__)\n", "print(\"CUDA version:\", torch.version.cuda)\n", "print(\"cuDNN version:\", torch.backends.cudnn.version())\n", "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "print(\"device:\", device.type)\n", "\n", "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "cellView": "form", "id": "4gfq46gnYcnU", "outputId": "41e2054a-e2e4-4bb5-ed39-8bd8bfc639c3" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CPU: 2\n", "RAM GB: 12.7\n", "PyTorch version: 1.10.0+cu111\n", "CUDA version: 11.1\n", "cuDNN version: 8005\n", "device: cuda\n", "Wed Jan 19 22:10:10 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 495.46 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 41C P8 9W / 70W | 3MiB / 15109MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "whlsBiJgR8le" } }, { "cell_type": "code", "source": [ "%%capture\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n", "!pip install ruclip==0.0.1rc7\n", "!pip install onnxruntime-gpu" ], "metadata": { "id": "HnbpAkvuR73L" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "tqy0zKM4R-7M" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "\n", "# priority device (if available)\n", "print(onnxruntime.get_device())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x8IN72OnSAIh", "outputId": "3174cf2c-ace3-4e1f-a550-e16c72302d51" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## RuCLIP\n", "WARNING: specific RuCLIP like forward \"model(text, image)\" instead of classic(OpenAI CLIP) \"model(image, text)\"" ], "metadata": { "id": "8_wSsSheT5mw" } }, { "cell_type": "code", "source": [ "import 
warnings\n", "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning)" ], "metadata": { "id": "gZTxanR26knr" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "import ruclip\n", "\n", "# onnx cannot export with cuda\n", "model, processor = ruclip.load(\"ruclip-vit-base-patch32-384\", device=\"cpu\")" ], "metadata": { "id": "FdTLuqsJUBFY" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from PIL import Image\n", "import numpy as np\n", "\n", "# simple input\n", "pil_images = [Image.open(\"CLIP.png\")]\n", "labels = ['диаграмма', 'собака', 'кошка']\n", "dummy_input = processor(text=labels, images=pil_images,\n", " return_tensors='pt', padding=True)\n", "\n", "# batch first\n", "image = dummy_input[\"pixel_values\"] # torch tensor [1, 3, 384, 384]\n", "image_onnx = dummy_input[\"pixel_values\"].cpu().detach().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = dummy_input[\"input_ids\"] # torch tensor [3, 77]\n", "text_onnx = dummy_input[\"input_ids\"].cpu().detach().numpy()[::-1].astype(np.int64)" ], "metadata": { "id": "rPwc6A2SSGyl" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "#RuCLIP output\n", "logits_per_image, logits_per_text = model(text, image)\n", "probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()\n", "\n", "print(\"Label probs:\", probs) # prints: [[0.9885839 0.00894288 0.0024732 ]]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pv0mH626SdzO", "outputId": "d563462f-b2a9-4d49-b491-17e88ffa81f0" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Label probs: [[0.9885839 0.00894288 0.0024732 ]]\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Convert RuCLIP model to ONNX" ], "metadata": { "id": "R_e5OjJeXRiF" } }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx\n", "\n", "visual_path = \"clip_visual.onnx\"\n", "textual_path = 
\"clip_textual.onnx\"\n", "\n", "onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)\n", "onnx_model.convert2onnx(image, text, verbose=True)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "oYM5FDSGSJBW", "outputId": "c647dc2e-946d-4769-c66e-77edfa98237f" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start convert visual model\n", "[CLIP ONNX] Start check visual model\n", "[CLIP ONNX] Start convert textual model\n", "[CLIP ONNX] Start check textual model\n", "[CLIP ONNX] Models converts successfully\n" ] } ] }, { "cell_type": "markdown", "source": [ "## [ONNX] CPU inference mode" ], "metadata": { "id": "U1Pr-YTtSEhs" } }, { "cell_type": "code", "source": [ "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode" ], "metadata": { "id": "aY9wRe5kT3wG" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "image_features = onnx_model.encode_image(image_onnx)\n", "text_features = onnx_model.encode_text(text_onnx)\n", "\n", "logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)\n", "probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()\n", "\n", "print(\"Label probs:\", probs) # prints: Label probs: [[0.90831375 0.07174418 0.01994203]]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tYVuk72nSLw6", "outputId": "75bf3803-6ed7-4516-ccd0-42f9cf7f22e0" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Label probs: [[0.90831375 0.07174418 0.01994203]]\n" ] } ] }, { "cell_type": "code", "source": [ "%timeit onnx_model.encode_text(text_onnx) # text representation" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Bpu4_HFRVeNk", "outputId": "e8f1681b-40dc-495f-d382-f0348d87c412" }, "execution_count": 8, "outputs": 
[ { "output_type": "stream", "name": "stdout", "text": [ "1 loop, best of 5: 285 ms per loop\n" ] } ] }, { "cell_type": "code", "source": [ "%timeit onnx_model.encode_image(image_onnx) # image representation" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JsOccP2gVmpo", "outputId": "adb33860-b000-461b-959f-95126e2ac049" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1 loop, best of 5: 412 ms per loop\n" ] } ] }, { "cell_type": "markdown", "source": [ "## [ONNX] GPU inference mode" ], "metadata": { "id": "Zww0E-jIULug" } }, { "cell_type": "code", "source": [ "onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # cuda mode" ], "metadata": { "id": "PBakYeiQUOAm" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "%timeit onnx_model.encode_text(text_onnx) # text representation" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EjvRBvCaWJBL", "outputId": "07426652-1cc5-4713-c355-fb4f1bd138d4" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The slowest run took 5.07 times longer than the fastest. This could mean that an intermediate result is being cached.\n", "100 loops, best of 5: 6.89 ms per loop\n" ] } ] }, { "cell_type": "code", "source": [ "%timeit onnx_model.encode_image(image_onnx) # image representation" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pmu4mQCsWJ8w", "outputId": "5cb45026-dfd3-419d-e5d3-f5d0d9681cd0" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The slowest run took 699.84 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", "1 loop, best of 5: 18.9 ms per loop\n" ] } ] } ] } ================================================ FILE: examples/clip_onnx_example.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "clip_onnx_example.ipynb", "provenance": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "fxPg_VvZuScV" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "al_QNjyFq6Jj" }, "outputs": [], "source": [ "%%capture\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n", "!pip install git+https://github.com/openai/CLIP.git\n", "!pip install onnxruntime-gpu" ] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "42eeJz9lTdJ6" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XuauIZIBSEUX", "outputId": "2c7c2bd9-90dd-4b1a-e98a-79e1f2218644" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Thu Jan 6 16:36:44 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 495.44 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. 
ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 35C P8 26W / 149W | 0MiB / 11441MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "print(onnxruntime.get_device())" ], "metadata": { "id": "gqvxpdajRX5_" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## CPU inference mode" ], "metadata": { "id": "010k-ksVTjAu" } }, { "cell_type": "markdown", "source": [ "### Torch CLIP" ], "metadata": { "id": "KdTz0IJWVBqE" } }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "import numpy as np\n", "\n", "# onnx cannot work with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n", "\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).cpu() # [1, 3, 224, 224]\n", "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).cpu() # [3, 77]\n", "text_onnx = text.detach().cpu().numpy().astype(np.int64)" ], "metadata": { "id": "9ROPwKYurOhP" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "%timeit model(image, text)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1CrHQ8cYt8Cx", "outputId": 
"4d98f85d-4b02-4ae2-b18f-fb3c7a2d6caf" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1 loop, best of 5: 636 ms per loop\n" ] } ] }, { "cell_type": "markdown", "source": [ "### CLIP-ONNX" ], "metadata": { "id": "Ao2MriaVVG6Y" } }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx, attention\n", "clip.model.ResidualAttentionBlock.attention = attention\n", "\n", "onnx_model = clip_onnx(model)\n", "onnx_model.convert2onnx(image, text, verbose=True)\n", "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode" ], "metadata": { "id": "nSeG9uAZrcph", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8c394684-d78e-49f6-a60f-872485d5f650" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start convert visual model\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/clip_onnx/utils.py:40: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n", " head_dim = q.shape[2] // num_heads\n", "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:716: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n", " warnings.warn(\"allowzero=0 by default. 
In order to honor zero value in shape use allowzero=1\")\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start check visual model\n", "[CLIP ONNX] Start convert textual model\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2819: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n", " \"If indices include negative values, the exported graph will produce incorrect results.\")\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start check textual model\n", "[CLIP ONNX] Models converts successfully\n" ] } ] }, { "cell_type": "code", "source": [ "%timeit onnx_model(image_onnx, text_onnx)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "B15dr51UrvMh", "outputId": "7c5fbc64-61f5-4742-d5a1-24d123971515" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1 loop, best of 5: 550 ms per loop\n" ] } ] }, { "cell_type": "markdown", "source": [ "## GPU inference mode\n", "Select a runtime GPU to continue:\n", "\n", "Click Runtime -> Change Runtime Type -> switch \"Hardware accelerator\" to be GPU. 
Save it, and you may connect to a GPU" ], "metadata": { "id": "Ahh_7CkTUb8y" } }, { "cell_type": "markdown", "source": [ "### CLIP-ONNX" ], "metadata": { "id": "B6M7yq7qceb5" } }, { "cell_type": "code", "source": [ "onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "6LtPSZhfUd_m" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_model.visual_session.get_providers() # optional" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xE0VGt9sQwrf", "outputId": "6feb4701-7b7f-437e-dc2f-c95c504dbb89" }, "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['CUDAExecutionProvider', 'CPUExecutionProvider']" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "%timeit onnx_model(image_onnx, text_onnx)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iPUVzqmgcYas", "outputId": "3e7c1526-6e38-4982-ca36-eabfc95c2ab9" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The slowest run took 79.70 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", "1 loop, best of 5: 60.8 ms per loop\n" ] } ] }, { "cell_type": "markdown", "source": [ "### Torch CLIP" ], "metadata": { "id": "jb58mrkbch2V" } }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "\n", "device = \"cuda\"\n", "# onnx cannot work with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=device, jit=False)\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).to(device) # [1, 3, 224, 224]\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).to(device) # [3, 77]" ], "metadata": { "id": "gidR99GOckyF" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "%timeit model(image, text)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XpBrtjlOcwOC", "outputId": "56375401-18a0-499b-f29b-c6e2d4d07e42" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "10 loops, best of 5: 72.2 ms per loop\n" ] } ] } ] } ================================================ FILE: examples/dev/clip_onnx_benchmark_cpu.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "clip-onnx-benchmark-cpu.ipynb", "provenance": [], "authorship_tag": "ABX9TyNUvpypuYYk54s1lZecP8Pf", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "fxPg_VvZuScV" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "al_QNjyFq6Jj" }, "outputs": [], "source": [ "%%capture\n", "!pip install 
git+https://github.com/Lednik7/CLIP-ONNX.git@dev\n", "!pip install git+https://github.com/openai/CLIP.git\n", "!pip install onnxruntime-gpu" ] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "42eeJz9lTdJ6" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XuauIZIBSEUX", "outputId": "7e3fa9a5-2970-4bc1-81e5-9ec997a267a1" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Tue May 3 06:56:57 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 47C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "print(onnxruntime.get_device())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gqvxpdajRX5_", "outputId": "4ad23904-186a-4e19-af9a-66538a70a3c8" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## GPU inference mode\n", "Select a runtime GPU to continue:\n", "\n", "Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. 
Save it, and you may connect to a GPU" ], "metadata": { "id": "010k-ksVTjAu" } }, { "cell_type": "markdown", "source": [ "### Torch CLIP" ], "metadata": { "id": "KdTz0IJWVBqE" } }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "import numpy as np\n", "\n", "# onnx cannot work with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n", "\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n", "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n", "text_onnx = text.detach().cpu().numpy().astype(np.int32)" ], "metadata": { "id": "9ROPwKYurOhP" }, "execution_count": 4, "outputs": [] }, { "cell_type": "markdown", "source": [ "### CLIP-ONNX" ], "metadata": { "id": "Ao2MriaVVG6Y" } }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx\n", "\n", "onnx_model = clip_onnx(model)\n", "onnx_model.convert2onnx(image, text, verbose=True)\n", "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "nSeG9uAZrcph", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "32e7fb6e-191a-4c3a-a8be-42ddf41ee62d" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start convert visual model\n", "[CLIP ONNX] Start check visual model\n", "[CLIP ONNX] Start convert textual model\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2909: UserWarning: Exporting aten::index operator of advanced indexing in opset 12 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. 
If indices include negative values, the exported graph will produce incorrect results.\n", " \"If indices include negative values, the exported graph will produce incorrect results.\")\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start check textual model\n", "[CLIP ONNX] Models converts successfully\n" ] } ] }, { "cell_type": "code", "source": [ "onnx_model = clip_onnx(model)\n", "onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n", " \"/content/clip_textual.onnx\",\n", " model.logit_scale.exp())\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "PsDS7ty79zZf" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_model.visual_session.get_providers()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aZsGJNrbNCYe", "outputId": "27eec69c-6535-46e1-d98a-15836459149e" }, "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['CPUExecutionProvider']" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "markdown", "source": [ "## Benchmark" ], "metadata": { "id": "J5IcOG_6jAFz" } }, { "cell_type": "code", "source": [ "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)" ], "metadata": { "id": "SJ_5_x7vLepK" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "model.eval()\n", "for x in model.parameters():\n", " x.requires_grad = False" ], "metadata": { "id": "OnOzZ3LMuubW" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "import numpy, random, torch" ], "metadata": { "id": "wDwqRRrTGKUS" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "def set_seed():\n", " torch.manual_seed(12)\n", " torch.cuda.manual_seed(12)\n", " np.random.seed(12)\n", " random.seed(12)\n", "\n", " torch.backends.cudnn.deterministic=True" ], "metadata": { "id": "9H17n_6gGJgT" }, "execution_count": 12, 
"outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "import time\n", "\n", "n = 5\n", "clip_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "onnx_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "for batch in [2, 8, 16, 32, 64]:\n", " set_seed()\n", " t_mean = []\n", " for _ in range(n):\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n", " image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n", " t = time.time()\n", " onnx_model.encode_image(image_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n", " t = time.time()\n", " model.encode_image(image_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " t_mean = []\n", " for _ in range(n):\n", " text_input = torch.randint(320, 49407, (batch, 77))\n", " text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)\n", " t = time.time()\n", " onnx_model.encode_text(text_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " text_input = torch.randint(320, 49407, (batch, 77))\n", " t = time.time()\n", " model.encode_text(text_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, 
\"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " print(\"-\" * 78)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4lFL6tzWjiWL", "outputId": "45819718-619e-429c-9aa4-7e28b068b9a3" }, "execution_count": 13, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "onnx 2 encode_image 0.234\n", "torch 2 encode_image 0.343\n", "onnx 2 encode_text 0.162\n", "torch 2 encode_text 0.243\n", "------------------------------------------------------------------------------\n", "onnx 8 encode_image 0.923\n", "torch 8 encode_image 1.093\n", "onnx 8 encode_text 0.656\n", "torch 8 encode_text 0.831\n", "------------------------------------------------------------------------------\n", "onnx 16 encode_image 2.079\n", "torch 16 encode_image 1.952\n", "onnx 16 encode_text 1.288\n", "torch 16 encode_text 1.523\n", "------------------------------------------------------------------------------\n", "onnx 32 encode_image 3.937\n", "torch 32 encode_image 4.079\n", "onnx 32 encode_text 2.658\n", "torch 32 encode_text 3.015\n", "------------------------------------------------------------------------------\n", "onnx 64 encode_image 7.944\n", "torch 64 encode_image 8.07\n", "onnx 64 encode_text 5.567\n", "torch 64 encode_text 6.212\n", "------------------------------------------------------------------------------\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd" ], "metadata": { "id": "P2YhbE9v_4ci" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n", " \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n", " \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n", " clip_results[\"encode_image\"]) for j in i],\n", " \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n", " clip_results[\"encode_text\"]) 
for j in i]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 362 }, "id": "WfZfDk4PAlqm", "outputId": "38710ad6-09ae-4c48-fc20-1cdabf4c2a50" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " backend batch encode_image encode_text\n", "0 onnx 2 0.234 0.162\n", "1 torch 2 0.343 0.243\n", "2 onnx 8 0.923 0.656\n", "3 torch 8 1.093 0.831\n", "4 onnx 16 2.079 1.288\n", "5 torch 16 1.952 1.523\n", "6 onnx 32 3.937 2.658\n", "7 torch 32 4.079 3.015\n", "8 onnx 64 7.944 5.567\n", "9 torch 64 8.070 6.212" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
backendbatchencode_imageencode_text
0onnx20.2340.162
1torch20.3430.243
2onnx80.9230.656
3torch81.0930.831
4onnx162.0791.288
5torch161.9521.523
6onnx323.9372.658
7torch324.0793.015
8onnx647.9445.567
9torch648.0706.212
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n", "onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]" ], "metadata": { "id": "Xpw9lV7yBbA8" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "LItAyQkeDhnQ", "outputId": "37517a71-baf3-494c-8a46-9f05cbfb7d32" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ONNX batch encode_image encode_text total\n", "0 ViT-B/32 2 0.234 0.162 0.396\n", "1 ViT-B/32 8 0.923 0.656 1.579\n", "2 ViT-B/32 16 2.079 1.288 3.367\n", "3 ViT-B/32 32 3.937 2.658 6.595\n", "4 ViT-B/32 64 7.944 5.567 13.511" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONNXbatchencode_imageencode_texttotal
0ViT-B/3220.2340.1620.396
1ViT-B/3280.9230.6561.579
2ViT-B/32162.0791.2883.367
3ViT-B/32323.9372.6586.595
4ViT-B/32647.9445.56713.511
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "print(onnx_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AIQDA9FaJZ7Y", "outputId": "8e8d4109-822e-4328-b2ca-66d4b9a19f8d" }, "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| ONNX | batch | encode_image | encode_text | total |\n", "|:---------|--------:|---------------:|--------------:|--------:|\n", "| ViT-B/32 | 2 | 0.234 | 0.162 | 0.396 |\n", "| ViT-B/32 | 8 | 0.923 | 0.656 | 1.579 |\n", "| ViT-B/32 | 16 | 2.079 | 1.288 | 3.367 |\n", "| ViT-B/32 | 32 | 3.937 | 2.658 | 6.595 |\n", "| ViT-B/32 | 64 | 7.944 | 5.567 | 13.511 |\n" ] } ] }, { "cell_type": "code", "source": [ "clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n", "clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]" ], "metadata": { "id": "E1OXQUDvDZmI" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "print(clip_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xAj-ynhCDpPO", "outputId": "88243c7f-bd6d-4a63-9ee2-154440c3df7e" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| TORCH | batch | encode_image | encode_text | total |\n", "|:---------|--------:|---------------:|--------------:|--------:|\n", "| ViT-B/32 | 2 | 0.343 | 0.243 | 0.586 |\n", "| ViT-B/32 | 8 | 1.093 | 0.831 | 1.924 |\n", "| ViT-B/32 | 16 | 1.952 | 1.523 | 3.475 |\n", "| ViT-B/32 | 32 | 4.079 | 3.015 | 7.094 |\n", "| ViT-B/32 | 64 | 8.07 | 6.212 | 14.282 |\n" ] } ] } ] } ================================================ FILE: examples/dev/clip_onnx_benchmark_gpu.ipynb 
================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "clip-onnx-benchmark-gpu.ipynb", "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "fxPg_VvZuScV" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "al_QNjyFq6Jj" }, "outputs": [], "source": [ "%%capture\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n", "!pip install git+https://github.com/openai/CLIP.git\n", "!pip install onnxruntime-gpu" ] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "42eeJz9lTdJ6" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XuauIZIBSEUX", "outputId": "7e2b352b-751e-439e-bb3d-4e1323e2e44d" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Thu Jan 6 15:47:04 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 495.44 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 34C P8 28W / 149W | 0MiB / 11441MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "print(onnxruntime.get_device())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gqvxpdajRX5_", "outputId": "7c44b4e1-d916-42d9-cc61-52efdf0fa9a9" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## GPU inference mode\n", "Select a runtime GPU to continue:\n", "\n", "Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. 
Save it, and you may connect to a GPU" ], "metadata": { "id": "010k-ksVTjAu" } }, { "cell_type": "markdown", "source": [ "### Torch CLIP" ], "metadata": { "id": "KdTz0IJWVBqE" } }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "import numpy as np\n", "\n", "# onnx cannot work with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n", "\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n", "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n", "text_onnx = text.detach().cpu().numpy().astype(np.int64)" ], "metadata": { "id": "9ROPwKYurOhP" }, "execution_count": 1, "outputs": [] }, { "cell_type": "markdown", "source": [ "### CLIP-ONNX" ], "metadata": { "id": "Ao2MriaVVG6Y" } }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx, attention\n", "clip.model.ResidualAttentionBlock.attention = attention\n", "\n", "onnx_model = clip_onnx(model)\n", "onnx_model.convert2onnx(image, text, verbose=False)\n", "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "nSeG9uAZrcph", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "25e07d68-6ef2-44c4-d144-c43b611f3316" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/clip_onnx/utils.py:40: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. 
To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n", " head_dim = q.shape[2] // num_heads\n", "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:716: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n", " warnings.warn(\"allowzero=0 by default. In order to honor zero value in shape use allowzero=1\")\n", "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2819: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n", " \"If indices include negative values, the exported graph will produce incorrect results.\")\n" ] } ] }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx, attention\n", "clip.model.ResidualAttentionBlock.attention = attention" ], "metadata": { "id": "imMVbHFO-KSH" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_model = clip_onnx(model)\n", "onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n", " \"/content/clip_textual.onnx\",\n", " model.logit_scale.exp())\n", "onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "PsDS7ty79zZf" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_model.visual_session.get_providers()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aZsGJNrbNCYe", "outputId": "9dcdd2d6-2a73-4dad-9ea7-c2892273c631" }, "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['CUDAExecutionProvider', 'CPUExecutionProvider']" ] }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "markdown", "source": [ "## Benchmark" ], "metadata": { "id": 
"J5IcOG_6jAFz" } }, { "cell_type": "code", "source": [ "model, preprocess = clip.load(\"ViT-B/32\", device=\"cuda\", jit=False)" ], "metadata": { "id": "SJ_5_x7vLepK" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "model.eval()\n", "for x in model.parameters():\n", " x.requires_grad = False" ], "metadata": { "id": "OnOzZ3LMuubW" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "import numpy, random, torch" ], "metadata": { "id": "wDwqRRrTGKUS" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "def set_seed():\n", " torch.manual_seed(12)\n", " torch.cuda.manual_seed(12)\n", " np.random.seed(12)\n", " random.seed(12)\n", "\n", " torch.backends.cudnn.deterministic=True" ], "metadata": { "id": "9H17n_6gGJgT" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "%timeit onnx_model.encode_image(image_onnx)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "IsJ2TsBRNh8f", "outputId": "bb642ee7-0112-4195-be35-14fdf719e7bc" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "The slowest run took 23.27 times longer than the fastest. 
This could mean that an intermediate result is being cached.\n", "1 loop, best of 5: 20.1 ms per loop\n" ] } ] }, { "cell_type": "code", "source": [ "import torch\n", "import time\n", "\n", "n = 5\n", "clip_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "onnx_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "for batch in [2, 8, 16, 32, 64]:\n", " set_seed()\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()\n", " text_input = torch.randint(320, 49407, (batch, 77)).cuda()\n", " image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n", " text_input_onnx = text_input.detach().cpu().numpy().astype(np.int64)\n", "\n", " t_mean = []\n", " for _ in range(n):\n", " t = time.time()\n", " onnx_model.encode_image(image_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " t = time.time()\n", " model.encode_image(image_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " t_mean = []\n", " for _ in range(n):\n", " t = time.time()\n", " onnx_model.encode_text(text_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " t = time.time()\n", " model.encode_text(text_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " 
torch.cuda.empty_cache()\n", " clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " print(\"-\" * 78)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4lFL6tzWjiWL", "outputId": "a209b78a-fe78-4b46-9220-4b9624a1568f" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "onnx 2 encode_image 0.073\n", "torch 2 encode_image 0.041\n", "onnx 2 encode_text 0.032\n", "torch 2 encode_text 0.033\n", "------------------------------------------------------------------------------\n", "onnx 8 encode_image 0.088\n", "torch 8 encode_image 0.128\n", "onnx 8 encode_text 0.052\n", "torch 8 encode_text 0.102\n", "------------------------------------------------------------------------------\n", "onnx 16 encode_image 0.123\n", "torch 16 encode_image 0.258\n", "onnx 16 encode_text 0.08\n", "torch 16 encode_text 0.201\n", "------------------------------------------------------------------------------\n", "onnx 32 encode_image 0.196\n", "torch 32 encode_image 0.505\n", "onnx 32 encode_text 0.138\n", "torch 32 encode_text 0.386\n", "------------------------------------------------------------------------------\n", "onnx 64 encode_image 0.352\n", "torch 64 encode_image 0.995\n", "onnx 64 encode_text 0.252\n", "torch 64 encode_text 0.754\n", "------------------------------------------------------------------------------\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd" ], "metadata": { "id": "P2YhbE9v_4ci" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n", " \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n", " \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n", " clip_results[\"encode_image\"]) for j in i],\n", " \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n", " clip_results[\"encode_text\"]) for j in i]})" ], "metadata": { "colab": { 
"base_uri": "https://localhost:8080/", "height": 362 }, "id": "WfZfDk4PAlqm", "outputId": "aa180c38-35f8-403c-a172-4e78266510d5" }, "execution_count": 12, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
backendbatchencode_imageencode_text
0onnx20.0730.032
1torch20.0410.033
2onnx80.0880.052
3torch80.1280.102
4onnx160.1230.080
5torch160.2580.201
6onnx320.1960.138
7torch320.5050.386
8onnx640.3520.252
9torch640.9950.754
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ], "text/plain": [ " backend batch encode_image encode_text\n", "0 onnx 2 0.073 0.032\n", "1 torch 2 0.041 0.033\n", "2 onnx 8 0.088 0.052\n", "3 torch 8 0.128 0.102\n", "4 onnx 16 0.123 0.080\n", "5 torch 16 0.258 0.201\n", "6 onnx 32 0.196 0.138\n", "7 torch 32 0.505 0.386\n", "8 onnx 64 0.352 0.252\n", "9 torch 64 0.995 0.754" ] }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n", "onnx_df[\"summary\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]" ], "metadata": { "id": "Xpw9lV7yBbA8" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "LItAyQkeDhnQ", "outputId": "ebd84ad1-f305-4578-9164-2884aaa2b245" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONNXbatchencode_imageencode_textsummary
0ViT-B/3220.0730.0320.105
1ViT-B/3280.0880.0520.140
2ViT-B/32160.1230.0800.203
3ViT-B/32320.1960.1380.334
4ViT-B/32640.3520.2520.604
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ], "text/plain": [ " ONNX batch encode_image encode_text summary\n", "0 ViT-B/32 2 0.073 0.032 0.105\n", "1 ViT-B/32 8 0.088 0.052 0.140\n", "2 ViT-B/32 16 0.123 0.080 0.203\n", "3 ViT-B/32 32 0.196 0.138 0.334\n", "4 ViT-B/32 64 0.352 0.252 0.604" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "print(onnx_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AIQDA9FaJZ7Y", "outputId": "4fdfd92a-5c8c-43d9-e875-7bcddc882113" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| ONNX | batch | encode_image | encode_text | summary |\n", "|:---------|--------:|---------------:|--------------:|----------:|\n", "| ViT-B/32 | 2 | 0.073 | 0.032 | 0.105 |\n", "| ViT-B/32 | 8 | 0.088 | 0.052 | 0.14 |\n", "| ViT-B/32 | 16 | 0.123 | 0.08 | 0.203 |\n", "| ViT-B/32 | 32 | 0.196 | 0.138 | 0.334 |\n", "| ViT-B/32 | 64 | 0.352 | 0.252 | 0.604 |\n" ] } ] }, { "cell_type": "code", "source": [ "clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n", "clip_df[\"summary\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]" ], "metadata": { "id": "E1OXQUDvDZmI" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "print(clip_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xAj-ynhCDpPO", "outputId": "6a36903d-6bba-4675-8eb3-7f58af98e165" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| TORCH | batch | encode_image | encode_text | summary |\n", "|:---------|--------:|---------------:|--------------:|----------:|\n", "| ViT-B/32 | 2 | 0.041 | 0.033 | 0.074 |\n", "| ViT-B/32 | 8 | 0.128 | 0.102 | 0.23 |\n", "| ViT-B/32 | 16 | 0.258 | 0.201 
| 0.459 |\n", "| ViT-B/32 | 32 | 0.505 | 0.386 | 0.891 |\n", "| ViT-B/32 | 64 | 0.995 | 0.754 | 1.749 |\n" ] } ] } ] } ================================================ FILE: examples/dev/clip_onnx_benchmark_gpu_K80.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "clip-onnx-benchmark-gpu-K80.ipynb", "provenance": [], "authorship_tag": "ABX9TyOXxz4T8v9RCW/JZlRRUtl4", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "fxPg_VvZuScV" } }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "al_QNjyFq6Jj" }, "outputs": [], "source": [ "%%capture\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev\n", "!pip install git+https://github.com/openai/CLIP.git\n", "!pip install onnxruntime-gpu" ] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "42eeJz9lTdJ6" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XuauIZIBSEUX", "outputId": "3bfb5833-272d-4aa0-f296-edab8122547c" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Tue May 3 07:20:58 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id 
Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 56C P8 29W / 149W | 0MiB / 11441MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "print(onnxruntime.get_device())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gqvxpdajRX5_", "outputId": "bb8e9195-fe9c-421c-e27b-d76da7136b82" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## GPU inference mode\n", "Select a runtime GPU to continue:\n", "\n", "Click Runtime -> Change Runtime Type -> switch \"Hardware accelerator\" to be GPU. 
Save it, and you may connect to GPU" ], "metadata": { "id": "010k-ksVTjAu" } }, { "cell_type": "markdown", "source": [ "### Torch CLIP" ], "metadata": { "id": "KdTz0IJWVBqE" } }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "import numpy as np\n", "\n", "# onnx cannot work with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n", "\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n", "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n", "text_onnx = text.detach().cpu().numpy().astype(np.int32)" ], "metadata": { "id": "9ROPwKYurOhP" }, "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "source": [ "### CLIP-ONNX" ], "metadata": { "id": "Ao2MriaVVG6Y" } }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx\n", "from clip_onnx.utils import DEFAULT_EXPORT\n", "\n", "DEFAULT_EXPORT[\"opset_version\"] = 15\n", "\n", "onnx_model = clip_onnx(model)\n", "onnx_model.convert2onnx(image, text, verbose=True)\n", "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "nSeG9uAZrcph", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "1d4a8404-104f-4107-f2c4-e7e1f7b1d104" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start convert visual model\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:719: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n", " warnings.warn(\"allowzero=0 by default. 
In order to honor zero value in shape use allowzero=1\")\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start check visual model\n", "[CLIP ONNX] Start convert textual model\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2909: UserWarning: Exporting aten::index operator of advanced indexing in opset 15 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n", " \"If indices include negative values, the exported graph will produce incorrect results.\")\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start check textual model\n", "[CLIP ONNX] Models converts successfully\n" ] } ] }, { "cell_type": "code", "source": [ "onnx_model = clip_onnx(model)\n", "onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n", " \"/content/clip_textual.onnx\",\n", " model.logit_scale.exp())\n", "onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "PsDS7ty79zZf" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_model.visual_session.get_providers()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aZsGJNrbNCYe", "outputId": "b0ee40a7-2ece-4e88-9e35-9ed0a735c533" }, "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['CUDAExecutionProvider', 'CPUExecutionProvider']" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "markdown", "source": [ "## Benchmark" ], "metadata": { "id": "J5IcOG_6jAFz" } }, { "cell_type": "code", "source": [ "model, preprocess = clip.load(\"ViT-B/32\", device=\"cuda\", jit=False)" ], "metadata": { "id": "SJ_5_x7vLepK" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "model.eval()\n", "for x in 
model.parameters():\n", " x.requires_grad = False" ], "metadata": { "id": "OnOzZ3LMuubW" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "import numpy, random, torch" ], "metadata": { "id": "wDwqRRrTGKUS" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "def set_seed():\n", " torch.manual_seed(12)\n", " torch.cuda.manual_seed(12)\n", " np.random.seed(12)\n", " random.seed(12)\n", "\n", " torch.backends.cudnn.deterministic=True" ], "metadata": { "id": "9H17n_6gGJgT" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "import time\n", "\n", "n = 5\n", "clip_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "onnx_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "for batch in [2, 8, 16, 32, 64]:\n", " set_seed()\n", " t_mean = []\n", " for _ in range(n):\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n", " image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n", " t = time.time()\n", " onnx_model.encode_image(image_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()\n", " t = time.time()\n", " model.encode_image(image_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " t_mean = []\n", " for _ in range(n):\n", " text_input = torch.randint(320, 49407, (batch, 77))\n", " text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)\n", " t = 
time.time()\n", " onnx_model.encode_text(text_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " text_input = torch.randint(320, 49407, (batch, 77)).cuda()\n", " t = time.time()\n", " model.encode_text(text_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " print(\"-\" * 78)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4lFL6tzWjiWL", "outputId": "ccaa7e0a-96f3-4a51-c4bd-c442aa13763c" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "onnx 2 encode_image 0.136\n", "torch 2 encode_image 0.02\n", "onnx 2 encode_text 0.021\n", "torch 2 encode_text 0.035\n", "------------------------------------------------------------------------------\n", "onnx 8 encode_image 0.054\n", "torch 8 encode_image 0.081\n", "onnx 8 encode_text 0.04\n", "torch 8 encode_text 0.098\n", "------------------------------------------------------------------------------\n", "onnx 16 encode_image 0.089\n", "torch 16 encode_image 0.207\n", "onnx 16 encode_text 0.071\n", "torch 16 encode_text 0.196\n", "------------------------------------------------------------------------------\n", "onnx 32 encode_image 0.158\n", "torch 32 encode_image 0.44\n", "onnx 32 encode_text 0.134\n", "torch 32 encode_text 0.374\n", "------------------------------------------------------------------------------\n", "onnx 64 encode_image 0.325\n", "torch 64 encode_image 0.919\n", "onnx 64 encode_text 0.258\n", "torch 64 encode_text 0.719\n", 
"------------------------------------------------------------------------------\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd" ], "metadata": { "id": "P2YhbE9v_4ci" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n", " \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n", " \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n", " clip_results[\"encode_image\"]) for j in i],\n", " \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n", " clip_results[\"encode_text\"]) for j in i]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 362 }, "id": "WfZfDk4PAlqm", "outputId": "78a5cae8-68ee-4edd-f34d-ccf7d3d8a23b" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " backend batch encode_image encode_text\n", "0 onnx 2 0.136 0.021\n", "1 torch 2 0.020 0.035\n", "2 onnx 8 0.054 0.040\n", "3 torch 8 0.081 0.098\n", "4 onnx 16 0.089 0.071\n", "5 torch 16 0.207 0.196\n", "6 onnx 32 0.158 0.134\n", "7 torch 32 0.440 0.374\n", "8 onnx 64 0.325 0.258\n", "9 torch 64 0.919 0.719" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
backendbatchencode_imageencode_text
0onnx20.1360.021
1torch20.0200.035
2onnx80.0540.040
3torch80.0810.098
4onnx160.0890.071
5torch160.2070.196
6onnx320.1580.134
7torch320.4400.374
8onnx640.3250.258
9torch640.9190.719
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n", "onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]" ], "metadata": { "id": "Xpw9lV7yBbA8" }, "execution_count": 15, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "LItAyQkeDhnQ", "outputId": "f9c1860c-e405-4d41-e530-d2b0027f1fd0" }, "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ONNX batch encode_image encode_text total\n", "0 ViT-B/32 2 0.136 0.021 0.157\n", "1 ViT-B/32 8 0.054 0.040 0.094\n", "2 ViT-B/32 16 0.089 0.071 0.160\n", "3 ViT-B/32 32 0.158 0.134 0.292\n", "4 ViT-B/32 64 0.325 0.258 0.583" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONNXbatchencode_imageencode_texttotal
0ViT-B/3220.1360.0210.157
1ViT-B/3280.0540.0400.094
2ViT-B/32160.0890.0710.160
3ViT-B/32320.1580.1340.292
4ViT-B/32640.3250.2580.583
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "print(onnx_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AIQDA9FaJZ7Y", "outputId": "36aa68bb-8ebb-47de-d2b4-b8ce36cacfd7" }, "execution_count": 17, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| ONNX | batch | encode_image | encode_text | total |\n", "|:---------|--------:|---------------:|--------------:|--------:|\n", "| ViT-B/32 | 2 | 0.136 | 0.021 | 0.157 |\n", "| ViT-B/32 | 8 | 0.054 | 0.04 | 0.094 |\n", "| ViT-B/32 | 16 | 0.089 | 0.071 | 0.16 |\n", "| ViT-B/32 | 32 | 0.158 | 0.134 | 0.292 |\n", "| ViT-B/32 | 64 | 0.325 | 0.258 | 0.583 |\n" ] } ] }, { "cell_type": "code", "source": [ "clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n", "clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]" ], "metadata": { "id": "E1OXQUDvDZmI" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "print(clip_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xAj-ynhCDpPO", "outputId": "6f31dab3-8b2a-4b64-ed97-2ac309d6d749" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| TORCH | batch | encode_image | encode_text | total |\n", "|:---------|--------:|---------------:|--------------:|--------:|\n", "| ViT-B/32 | 2 | 0.02 | 0.035 | 0.055 |\n", "| ViT-B/32 | 8 | 0.081 | 0.098 | 0.179 |\n", "| ViT-B/32 | 16 | 0.207 | 0.196 | 0.403 |\n", "| ViT-B/32 | 32 | 0.44 | 0.374 | 0.814 |\n", "| ViT-B/32 | 64 | 0.919 | 0.719 | 1.638 |\n" ] } ] } ] } ================================================ FILE: examples/dev/clip_onnx_benchmark_gpu_T4.ipynb 
================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "clip-onnx-benchmark-gpu-T4.ipynb", "provenance": [], "authorship_tag": "ABX9TyNqeHpYdbkhiqZatysOn5ch", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "fxPg_VvZuScV" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "al_QNjyFq6Jj" }, "outputs": [], "source": [ "%%capture\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev\n", "!pip install git+https://github.com/openai/CLIP.git\n", "!pip install onnxruntime-gpu" ] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "42eeJz9lTdJ6" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XuauIZIBSEUX", "outputId": "3e459c2c-8f31-4aff-c288-f2e6c4684e36" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Tue May 3 07:10:09 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 38C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "print(onnxruntime.get_device())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gqvxpdajRX5_", "outputId": "48a89abb-a326-4563-f99a-40c7d25145af" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## GPU inference mode\n", "Select a runtime GPU to continue:\n", "\n", "Click Runtime -> Change Runtime Type -> switch \"Hardware accelerator\" to be GPU. 
Save it, and you may connect to GPU" ], "metadata": { "id": "010k-ksVTjAu" } }, { "cell_type": "markdown", "source": [ "### Torch CLIP" ], "metadata": { "id": "KdTz0IJWVBqE" } }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "import numpy as np\n", "\n", "# onnx cannot work with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n", "\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n", "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n", "text_onnx = text.detach().cpu().numpy().astype(np.int32)" ], "metadata": { "id": "9ROPwKYurOhP" }, "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "source": [ "### CLIP-ONNX" ], "metadata": { "id": "Ao2MriaVVG6Y" } }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx\n", "\n", "onnx_model = clip_onnx(model)\n", "onnx_model.convert2onnx(image, text, verbose=True)\n", "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "nSeG9uAZrcph", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "1186f909-6cfb-400b-c2d9-3dddc93d318b" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start convert visual model\n", "[CLIP ONNX] Start check visual model\n", "[CLIP ONNX] Start convert textual model\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2909: UserWarning: Exporting aten::index operator of advanced indexing in opset 12 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. 
If indices include negative values, the exported graph will produce incorrect results.\n", " \"If indices include negative values, the exported graph will produce incorrect results.\")\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start check textual model\n", "[CLIP ONNX] Models converts successfully\n" ] } ] }, { "cell_type": "code", "source": [ "onnx_model = clip_onnx(model)\n", "onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n", " \"/content/clip_textual.onnx\",\n", " model.logit_scale.exp())\n", "onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode" ], "metadata": { "id": "PsDS7ty79zZf" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_model.visual_session.get_providers()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aZsGJNrbNCYe", "outputId": "05464d1a-7047-4efd-80fe-32870cf34afd" }, "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['CUDAExecutionProvider', 'CPUExecutionProvider']" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "markdown", "source": [ "## Benchmark" ], "metadata": { "id": "J5IcOG_6jAFz" } }, { "cell_type": "code", "source": [ "model, preprocess = clip.load(\"ViT-B/32\", device=\"cuda\", jit=False)" ], "metadata": { "id": "SJ_5_x7vLepK" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "model.eval()\n", "for x in model.parameters():\n", " x.requires_grad = False" ], "metadata": { "id": "OnOzZ3LMuubW" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "import numpy, random, torch" ], "metadata": { "id": "wDwqRRrTGKUS" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "def set_seed():\n", " torch.manual_seed(12)\n", " torch.cuda.manual_seed(12)\n", " np.random.seed(12)\n", " random.seed(12)\n", "\n", " torch.backends.cudnn.deterministic=True" ], "metadata": { "id": "9H17n_6gGJgT" }, 
"execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "import time\n", "\n", "n = 5\n", "clip_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "onnx_results = {\"encode_image\": [],\n", " \"encode_text\": []}\n", "for batch in [2, 8, 16, 32, 64]:\n", " set_seed()\n", " t_mean = []\n", " for _ in range(n):\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n", " image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n", " t = time.time()\n", " onnx_model.encode_image(image_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()\n", " t = time.time()\n", " model.encode_image(image_input)\n", " t_mean.append(time.time() - t)\n", " print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " t_mean = []\n", " for _ in range(n):\n", " text_input = torch.randint(320, 49407, (batch, 77))\n", " text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)\n", " t = time.time()\n", " onnx_model.encode_text(text_input_onnx)\n", " t_mean.append(time.time() - t)\n", " print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " set_seed()\n", " with torch.inference_mode():\n", " t_mean = []\n", " for _ in range(n):\n", " text_input = torch.randint(320, 49407, (batch, 77)).cuda()\n", " t = time.time()\n", " model.encode_text(text_input)\n", " t_mean.append(time.time() - 
t)\n", " print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n", " torch.cuda.empty_cache()\n", " clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n", "\n", " print(\"-\" * 78)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4lFL6tzWjiWL", "outputId": "c2b9f0e4-9b93-408b-96bf-3fdb3057e15b" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "onnx 2 encode_image 0.155\n", "torch 2 encode_image 0.017\n", "onnx 2 encode_text 0.01\n", "torch 2 encode_text 0.009\n", "------------------------------------------------------------------------------\n", "onnx 8 encode_image 0.032\n", "torch 8 encode_image 0.008\n", "onnx 8 encode_text 0.014\n", "torch 8 encode_text 0.008\n", "------------------------------------------------------------------------------\n", "onnx 16 encode_image 0.037\n", "torch 16 encode_image 0.009\n", "onnx 16 encode_text 0.029\n", "torch 16 encode_text 0.012\n", "------------------------------------------------------------------------------\n", "onnx 32 encode_image 0.076\n", "torch 32 encode_image 0.008\n", "onnx 32 encode_text 0.059\n", "torch 32 encode_text 0.025\n", "------------------------------------------------------------------------------\n", "onnx 64 encode_image 0.169\n", "torch 64 encode_image 0.009\n", "onnx 64 encode_text 0.117\n", "torch 64 encode_text 0.049\n", "------------------------------------------------------------------------------\n" ] } ] }, { "cell_type": "code", "source": [ "import pandas as pd" ], "metadata": { "id": "P2YhbE9v_4ci" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n", " \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n", " \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n", " clip_results[\"encode_image\"]) for j in i],\n", " \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n", 
" clip_results[\"encode_text\"]) for j in i]})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 362 }, "id": "WfZfDk4PAlqm", "outputId": "3375eac7-47b0-40ba-c2d6-c30fda2ab6d5" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " backend batch encode_image encode_text\n", "0 onnx 2 0.155 0.010\n", "1 torch 2 0.017 0.009\n", "2 onnx 8 0.032 0.014\n", "3 torch 8 0.008 0.008\n", "4 onnx 16 0.037 0.029\n", "5 torch 16 0.009 0.012\n", "6 onnx 32 0.076 0.059\n", "7 torch 32 0.008 0.025\n", "8 onnx 64 0.169 0.117\n", "9 torch 64 0.009 0.049" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
backendbatchencode_imageencode_text
0onnx20.1550.010
1torch20.0170.009
2onnx80.0320.014
3torch80.0080.008
4onnx160.0370.029
5torch160.0090.012
6onnx320.0760.059
7torch320.0080.025
8onnx640.1690.117
9torch640.0090.049
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n", "onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]" ], "metadata": { "id": "Xpw9lV7yBbA8" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "onnx_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "LItAyQkeDhnQ", "outputId": "e6c88747-5eba-4c16-be40-d4de584f429e" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ONNX batch encode_image encode_text total\n", "0 ViT-B/32 2 0.155 0.010 0.165\n", "1 ViT-B/32 8 0.032 0.014 0.046\n", "2 ViT-B/32 16 0.037 0.029 0.066\n", "3 ViT-B/32 32 0.076 0.059 0.135\n", "4 ViT-B/32 64 0.169 0.117 0.286" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONNXbatchencode_imageencode_texttotal
0ViT-B/3220.1550.0100.165
1ViT-B/3280.0320.0140.046
2ViT-B/32160.0370.0290.066
3ViT-B/32320.0760.0590.135
4ViT-B/32640.1690.1170.286
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "print(onnx_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AIQDA9FaJZ7Y", "outputId": "8b197c3c-63d1-42c4-8ca3-a3258acfc878" }, "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| ONNX | batch | encode_image | encode_text | total |\n", "|:---------|--------:|---------------:|--------------:|--------:|\n", "| ViT-B/32 | 2 | 0.155 | 0.01 | 0.165 |\n", "| ViT-B/32 | 8 | 0.032 | 0.014 | 0.046 |\n", "| ViT-B/32 | 16 | 0.037 | 0.029 | 0.066 |\n", "| ViT-B/32 | 32 | 0.076 | 0.059 | 0.135 |\n", "| ViT-B/32 | 64 | 0.169 | 0.117 | 0.286 |\n" ] } ] }, { "cell_type": "code", "source": [ "clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n", "clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]" ], "metadata": { "id": "E1OXQUDvDZmI" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "source": [ "print(clip_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "xAj-ynhCDpPO", "outputId": "f90bc132-4727-45df-a6c2-49e2a68e0a4a" }, "execution_count": 18, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| TORCH | batch | encode_image | encode_text | total |\n", "|:---------|--------:|---------------:|--------------:|--------:|\n", "| ViT-B/32 | 2 | 0.017 | 0.009 | 0.026 |\n", "| ViT-B/32 | 8 | 0.008 | 0.008 | 0.016 |\n", "| ViT-B/32 | 16 | 0.009 | 0.012 | 0.021 |\n", "| ViT-B/32 | 32 | 0.008 | 0.025 | 0.033 |\n", "| ViT-B/32 | 64 | 0.009 | 0.049 | 0.058 |\n" ] } ] } ] } ================================================ FILE: examples/readme_example.ipynb ================================================ { 
"nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "readme_example.ipynb", "provenance": [], "authorship_tag": "ABX9TyPpME0Qdi/m3VZQ+jNj39dT", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload the session if something doesn't work" ], "metadata": { "id": "whlsBiJgR8le" } }, { "cell_type": "code", "source": [ "%%capture\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n", "!pip install git+https://github.com/openai/CLIP.git\n", "!pip install onnxruntime-gpu" ], "metadata": { "id": "HnbpAkvuR73L" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "%%capture\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "tqy0zKM4R-7M" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "!nvidia-smi # CPU Provider" ], "metadata": { "id": "eKqETHL4YscZ", "outputId": "7ff0bc18-fb40-4296-ab05-b079043e46a1", "colab": { "base_uri": "https://localhost:8080/" } }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. 
Make sure that the latest NVIDIA driver is installed and running.\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "\n", "print(onnxruntime.get_device()) # priority device" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x8IN72OnSAIh", "outputId": "81d14047-91fa-4a5c-a1e3-f5b550556591" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## CPU inference mode" ], "metadata": { "id": "U1Pr-YTtSEhs" } }, { "cell_type": "code", "source": [ "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning)" ], "metadata": { "id": "gZTxanR26knr" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "import clip\n", "from PIL import Image\n", "import numpy as np\n", "\n", "# onnx cannot export with cuda\n", "model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n", "\n", "# batch first\n", "image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).cpu() # [1, 3, 224, 224]\n", "image_onnx = image.detach().cpu().numpy().astype(np.float32)\n", "\n", "# batch first\n", "text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).cpu() # [3, 77]\n", "text_onnx = text.detach().cpu().numpy().astype(np.int32)" ], "metadata": { "id": "rPwc6A2SSGyl" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [ "from clip_onnx import clip_onnx, attention\n", "# clip.model.ResidualAttentionBlock.attention = attention\n", "\n", "visual_path = \"clip_visual.onnx\"\n", "textual_path = \"clip_textual.onnx\"\n", "\n", "onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)\n", "onnx_model.convert2onnx(image, text, verbose=True)\n", "# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n", "onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode" ], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "oYM5FDSGSJBW", "outputId": "816705b1-3829-4424-c7c4-5426cf21cc18" }, "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[CLIP ONNX] Start convert visual model\n", "[CLIP ONNX] Start check visual model\n", "[CLIP ONNX] Start convert textual model\n", "[CLIP ONNX] Start check textual model\n", "[CLIP ONNX] Models converts successfully\n" ] } ] }, { "cell_type": "code", "source": [ "image_features = onnx_model.encode_image(image_onnx)\n", "text_features = onnx_model.encode_text(text_onnx)\n", "\n", "logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)\n", "probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()\n", "\n", "print(\"Label probs:\", probs) # prints: [[0.9927937 0.00421067 0.00299571]]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tYVuk72nSLw6", "outputId": "41608059-3732-4ea7-c619-66f803af4185" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Label probs: [[0.9927937 0.00421067 0.00299571]]\n" ] } ] } ] } ================================================ FILE: examples/ru_CLIP_tiny_onnx.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "ru_CLIP_tiny_onnx.ipynb", "provenance": [], "collapsed_sections": [ "WWXCt_2NLhN_", "PHb4CAoRL3qC", "re2sSYAYO3D-", "ithu4-z0PIm5", "FWm0GAhWPzSW" ], "machine_shape": "hm" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "5319c7971f234d4bb615508f76475f9e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": 
null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_c43027a0735e459ca1f710e5a9c43177", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_c00c959249db4f2a9b97adabd2684c3c", "IPY_MODEL_d9e4edd05c1e40f991eb2c2f1fc9ebc1", "IPY_MODEL_ab4928c0a86449a384e36d8c0bc25717" ] } }, "c43027a0735e459ca1f710e5a9c43177": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "c00c959249db4f2a9b97adabd2684c3c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_3251223dac8f43c081701ff7f663cb35", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": 
"@jupyter-widgets/controls", "layout": "IPY_MODEL_d63d5559ce534b86969132d3ff8d875b" } }, "d9e4edd05c1e40f991eb2c2f1fc9ebc1": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_59618f021fc4495e9c401a421d28d4a0", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 381781, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 381781, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_c56ba935682647dca4bdcc593fe0d2cc" } }, "ab4928c0a86449a384e36d8c0bc25717": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_01808c7fec8447368d60a33b2d683851", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 373k/373k [00:00<00:00, 876kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_f9466e0349c84633a0fb8ceeffa2a984" } }, "3251223dac8f43c081701ff7f663cb35": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "d63d5559ce534b86969132d3ff8d875b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": 
{ "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "59618f021fc4495e9c401a421d28d4a0": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "c56ba935682647dca4bdcc593fe0d2cc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": 
null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "01808c7fec8447368d60a33b2d683851": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "f9466e0349c84633a0fb8ceeffa2a984": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": 
"1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "10ee9777b41e42129e2c9cc9327ad88f": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_cb6a647757244da3941602127ec38ccb", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_a64a223312144f2f9736729b63ab1ce5", "IPY_MODEL_7e7bce13eeed41179e4e15fc7afc89d5", "IPY_MODEL_7bacd13c23cf415fa5d58e9243c4a785" ] } }, "cb6a647757244da3941602127ec38ccb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "a64a223312144f2f9736729b63ab1ce5": { "model_module": 
"@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_9fe6e1167e5d45fbad2adab3d59e017d", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_655c507d8fcf423f8bd6746201f569ae" } }, "7e7bce13eeed41179e4e15fc7afc89d5": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_8c4812afaaec4d65bf84a1e77840d356", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 112, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 112, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_64dfa71e3dff4236908e0592e4f90250" } }, "7bacd13c23cf415fa5d58e9243c4a785": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_ec8d98c1edb148d3ae1c518b61e8155b", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 112/112 [00:00<00:00, 3.41kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_a8212828565d4c9884d44fb45dc51ee5" } }, "9fe6e1167e5d45fbad2adab3d59e017d": { "model_module": "@jupyter-widgets/controls", "model_name": 
"DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "655c507d8fcf423f8bd6746201f569ae": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "8c4812afaaec4d65bf84a1e77840d356": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "64dfa71e3dff4236908e0592e4f90250": { "model_module": 
"@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ec8d98c1edb148d3ae1c518b61e8155b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "a8212828565d4c9884d44fb45dc51ee5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, 
"_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9a2d4d7da3024cc0828b1a6dafd0dd16": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_fe4daa4d7d024187aa2f622dbf3577a8", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_8380bf9a899645e8aef576e640b41ea2", "IPY_MODEL_37c593d2f442497483cd0026498bab05", "IPY_MODEL_3cc7b132c94f427ba44858e4c4ce3019" ] } }, "fe4daa4d7d024187aa2f622dbf3577a8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": 
"LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "8380bf9a899645e8aef576e640b41ea2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_d2aed3c0f95b4677bd6e949a4ed0403e", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_bbc52e0e0b2f4758bd7d6cf44b4670ae" } }, "37c593d2f442497483cd0026498bab05": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_8513d262d8764d99aa5d3f2f178b875e", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 239, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 239, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_57984bbd46a84a7bb2b7629e6b2f9ef9" } }, "3cc7b132c94f427ba44858e4c4ce3019": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_b886e2b6bbcd46cf806ff3a0b3cb8d33", 
"_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 239/239 [00:00<00:00, 5.49kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_21ac91113f7e4548b416a32b1b3f66a9" } }, "d2aed3c0f95b4677bd6e949a4ed0403e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "bbc52e0e0b2f4758bd7d6cf44b4670ae": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, 
"8513d262d8764d99aa5d3f2f178b875e": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "57984bbd46a84a7bb2b7629e6b2f9ef9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "b886e2b6bbcd46cf806ff3a0b3cb8d33": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": 
"@jupyter-widgets/controls" } }, "21ac91113f7e4548b416a32b1b3f66a9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "f8958c6de2394fecab9f95388a365431": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_11a8a4b2d39d4ea8904c0f1b2f6dd906", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_9657501af7514a60b30fcd60a223980c", "IPY_MODEL_61274b2bac5e4835a8bd33dc201bc155", "IPY_MODEL_973300b095554b10ac290244772e0a6f" ] } }, "11a8a4b2d39d4ea8904c0f1b2f6dd906": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", 
"grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "9657501af7514a60b30fcd60a223980c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_c3f5f56bb14d44b6a5775a77f6763b94", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": "Downloading: 100%", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_fbf430940c8a49949953155b57d07766" } }, "61274b2bac5e4835a8bd33dc201bc155": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_ab05641bcb9c49aab977110fab503a78", "_dom_classes": [], "description": "", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 175, "_view_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "value": 175, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_4f11a71d7df943e48ac9ea3bab5c6771" } }, "973300b095554b10ac290244772e0a6f": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_b6004e09152045e18503cf75e32d4fa6", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 175/175 [00:00<00:00, 5.41kB/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_590fb707d26948b5b9c8bb3b896f29e1" } }, "c3f5f56bb14d44b6a5775a77f6763b94": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "fbf430940c8a49949953155b57d07766": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": 
null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "ab05641bcb9c49aab977110fab503a78": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "4f11a71d7df943e48ac9ea3bab5c6771": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, 
"grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "b6004e09152045e18503cf75e32d4fa6": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "590fb707d26948b5b9c8bb3b896f29e1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/cene555/ru-clip-tiny/blob/main/notebooks/ru_CLIP_tiny_onnx.ipynb)" ], "metadata": { "id": "JsWuTduwaagq" } }, { "cell_type": "markdown", "source": [ "## Select a 
runtime GPU to continue:\n", "\n", "Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU" ], "metadata": { "id": "VCCzmQdKJPkv" } }, { "cell_type": "code", "source": [ "#@title Allowed Resources\n", "import multiprocessing\n", "import torch\n", "from psutil import virtual_memory\n", "\n", "ram_gb = round(virtual_memory().total / 1024**3, 1)\n", "\n", "print('CPU:', multiprocessing.cpu_count())\n", "print('RAM GB:', ram_gb)\n", "print(\"PyTorch version:\", torch.__version__)\n", "print(\"CUDA version:\", torch.version.cuda)\n", "print(\"cuDNN version:\", torch.backends.cudnn.version())\n", "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "print(\"device:\", device.type)\n", "\n", "!nvidia-smi" ], "metadata": { "cellView": "form", "id": "6xdy_cPJEYXV", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9b5c5751-3377-4623-fd90-f59c21118c80" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CPU: 2\n", "RAM GB: 12.7\n", "PyTorch version: 1.10.0+cu111\n", "CUDA version: 11.1\n", "cuDNN version: 8005\n", "device: cuda\n", "Tue Feb 1 17:26:24 2022 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 495.46 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|===============================+======================+======================|\n", "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 61C P8 11W / 70W | 3MiB / 15109MiB | 0% Default |\n", "| | | N/A |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Restart colab session after installation\n", "Reload session if something doesn't work (may need multiple times)" ], "metadata": { "id": "hmNP7iJBj6XZ" } }, { "cell_type": "markdown", "source": [ "## Install requirements" ], "metadata": { "id": "WWXCt_2NLhN_" } }, { "cell_type": "code", "source": [ "%%capture\n", "!gdown -O ru-clip-tiny.pkl https://drive.google.com/uc?id=1-3g3J90pZmHo9jbBzsEmr7ei5zm3VXOL\n", "\n", "!pip install git+https://github.com/cene555/ru-clip-tiny.git\n", "!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n", "!pip install onnxruntime-gpu\n", "\n", "!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true" ], "metadata": { "id": "FWEEtd7Vryaf" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import onnxruntime\n", "\n", "# priority device (if available)\n", "print(onnxruntime.get_device())" ], "metadata": { "id": "bUFx02Dhjap4", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f595c387-da47-47e5-f96a-2d84adf3286b" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "GPU\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Import libraries" ], "metadata": { "id": "PHb4CAoRL3qC" } }, { 
"cell_type": "code", "source": [ "import torch\n", "from rucliptiny import RuCLIPtiny\n", "from rucliptiny.utils import get_transform\n", "from rucliptiny.tokenizer import Tokenizer" ], "metadata": { "id": "cznZ7ozDL5-M" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\", category=UserWarning)" ], "metadata": { "id": "57COx0BKCmFA" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Load model" ], "metadata": { "id": "ithu4-z0PIm5" } }, { "cell_type": "code", "source": [ "#@title speed_test function\n", "\n", "import time\n", "\n", "def speed_test(func, data_gen, n=5, empty_cache=True, is_text=False,\n", " first_run=True):\n", " if empty_cache: torch.cuda.empty_cache()\n", " if first_run:\n", " if is_text:\n", " input_data1, input_data2 = data_gen()\n", " func(input_data1, input_data2)\n", " else:\n", " input_data = data_gen()\n", " func(input_data)\n", " torch.cuda.empty_cache()\n", " \n", " values = []\n", " for _ in range(n):\n", " if is_text:\n", " input_data1, input_data2 = data_gen()\n", " else:\n", " input_data = data_gen()\n", " if is_text:\n", " t = time.time()\n", " func(input_data1, input_data2)\n", " else:\n", " t = time.time()\n", " func(input_data)\n", " values.append(time.time() - t)\n", " if empty_cache: torch.cuda.empty_cache()\n", " return sum(values) / n" ], "metadata": { "id": "GqKM04tP4Vv3", "cellView": "form" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "torch.manual_seed(1)\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" ], "metadata": { "id": "SSOHYDRQGif-" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = RuCLIPtiny()\n", "model.load_state_dict(torch.load('ru-clip-tiny.pkl',\n", " map_location=device))\n", "model = model.to(device).eval()\n", "for x in model.parameters(): x.requires_grad = False\n", 
"torch.cuda.empty_cache()" ], "metadata": { "id": "OpFAZfq-_nJe" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "transforms = get_transform()\n", "tokenizer = Tokenizer()" ], "metadata": { "id": "KEZj2WrwkzZz", "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "5319c7971f234d4bb615508f76475f9e", "c43027a0735e459ca1f710e5a9c43177", "c00c959249db4f2a9b97adabd2684c3c", "d9e4edd05c1e40f991eb2c2f1fc9ebc1", "ab4928c0a86449a384e36d8c0bc25717", "3251223dac8f43c081701ff7f663cb35", "d63d5559ce534b86969132d3ff8d875b", "59618f021fc4495e9c401a421d28d4a0", "c56ba935682647dca4bdcc593fe0d2cc", "01808c7fec8447368d60a33b2d683851", "f9466e0349c84633a0fb8ceeffa2a984", "10ee9777b41e42129e2c9cc9327ad88f", "cb6a647757244da3941602127ec38ccb", "a64a223312144f2f9736729b63ab1ce5", "7e7bce13eeed41179e4e15fc7afc89d5", "7bacd13c23cf415fa5d58e9243c4a785", "9fe6e1167e5d45fbad2adab3d59e017d", "655c507d8fcf423f8bd6746201f569ae", "8c4812afaaec4d65bf84a1e77840d356", "64dfa71e3dff4236908e0592e4f90250", "ec8d98c1edb148d3ae1c518b61e8155b", "a8212828565d4c9884d44fb45dc51ee5", "9a2d4d7da3024cc0828b1a6dafd0dd16", "fe4daa4d7d024187aa2f622dbf3577a8", "8380bf9a899645e8aef576e640b41ea2", "37c593d2f442497483cd0026498bab05", "3cc7b132c94f427ba44858e4c4ce3019", "d2aed3c0f95b4677bd6e949a4ed0403e", "bbc52e0e0b2f4758bd7d6cf44b4670ae", "8513d262d8764d99aa5d3f2f178b875e", "57984bbd46a84a7bb2b7629e6b2f9ef9", "b886e2b6bbcd46cf806ff3a0b3cb8d33", "21ac91113f7e4548b416a32b1b3f66a9", "f8958c6de2394fecab9f95388a365431", "11a8a4b2d39d4ea8904c0f1b2f6dd906", "9657501af7514a60b30fcd60a223980c", "61274b2bac5e4835a8bd33dc201bc155", "973300b095554b10ac290244772e0a6f", "c3f5f56bb14d44b6a5775a77f6763b94", "fbf430940c8a49949953155b57d07766", "ab05641bcb9c49aab977110fab503a78", "4f11a71d7df943e48ac9ea3bab5c6771", "b6004e09152045e18503cf75e32d4fa6", "590fb707d26948b5b9c8bb3b896f29e1" ] }, "outputId": "466854ba-7fa2-4154-ada2-391626146c95" }, "execution_count": null, 
"outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5319c7971f234d4bb615508f76475f9e", "version_minor": 0, "version_major": 2 }, "text/plain": [ "Downloading: 0%| | 0.00/373k [00:00\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
backendbatchencode_imageencode_text
0onnx20.0110.001
1torch20.0180.003
2onnx80.0350.002
3torch80.0100.003
4onnx160.0700.004
5torch160.0100.003
6onnx320.1450.007
7torch320.0120.004
8onnx640.2940.014
9torch640.0130.005
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", " \n", " " ], "text/plain": [ " backend batch encode_image encode_text\n", "0 onnx 2 0.011 0.001\n", "1 torch 2 0.018 0.003\n", "2 onnx 8 0.035 0.002\n", "3 torch 8 0.010 0.003\n", "4 onnx 16 0.070 0.004\n", "5 torch 16 0.010 0.003\n", "6 onnx 32 0.145 0.007\n", "7 torch 32 0.012 0.004\n", "8 onnx 64 0.294 0.014\n", "9 torch 64 0.013 0.005" ] }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "onnx_df = pd.DataFrame({\"ONNX\": [\"RuCLIPtiny\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n", "onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]\n", "\n", "print(onnx_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ol9_RiUoG34e", "outputId": "82be9e0e-b92e-4e3c-8132-9269eb22a41d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "| ONNX | batch | encode_image | encode_text | total |\n", "|:-----------|--------:|---------------:|--------------:|--------:|\n", "| RuCLIPtiny | 2 | 0.011 | 0.001 | 0.012 |\n", "| RuCLIPtiny | 8 | 0.035 | 0.002 | 0.037 |\n", "| RuCLIPtiny | 16 | 0.07 | 0.004 | 0.074 |\n", "| RuCLIPtiny | 32 | 0.145 | 0.007 | 0.152 |\n", "| RuCLIPtiny | 64 | 0.294 | 0.014 | 0.308 |\n" ] } ] }, { "cell_type": "code", "source": [ "clip_df = pd.DataFrame({\"TORCH\": [\"RuCLIPtiny\"] * 5,\n", " \"batch\": [2, 8, 16, 32, 64],\n", " \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n", " \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n", "clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]\n", "print(clip_df.to_markdown(index=False))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qw8ZK9XeG4LY", "outputId": "326b24f9-9d21-47ed-d62c-d7594e786b96" }, "execution_count": null, "outputs": 
[ { "output_type": "stream", "name": "stdout", "text": [ "| TORCH | batch | encode_image | encode_text | total |\n", "|:-----------|--------:|---------------:|--------------:|--------:|\n", "| RuCLIPtiny | 2 | 0.018 | 0.003 | 0.021 |\n", "| RuCLIPtiny | 8 | 0.01 | 0.003 | 0.013 |\n", "| RuCLIPtiny | 16 | 0.01 | 0.003 | 0.013 |\n", "| RuCLIPtiny | 32 | 0.012 | 0.004 | 0.016 |\n", "| RuCLIPtiny | 64 | 0.013 | 0.005 | 0.018 |\n" ] } ] } ] } ================================================ FILE: requirements.txt ================================================ torch==1.13.1 onnxruntime>=1.11.1 onnx>=1.11.0 ================================================ FILE: setup.py ================================================ import os import pkg_resources from setuptools import setup, find_packages with open("requirements.txt", "r") as f: install_requires = f.read().split("\n") setup( name="clip_onnx", version="1.2", py_modules=["clip_onnx, clip"], description="", author="Maxim Gerasimov", packages=find_packages(), install_requires=install_requires, include_package_data=True )