Repository: Lednik7/CLIP-ONNX
Branch: main
Commit: ebd4852b7d3e
Files: 19
Total size: 284.9 KB
Directory structure:
gitextract_k9qaok4o/
├── .gitignore
├── LICENSE
├── README.md
├── benchmark.md
├── clip_onnx/
│ ├── __init__.py
│ ├── benchmark.py
│ ├── clip_converter.py
│ ├── clip_onnx.py
│ └── utils.py
├── examples/
│ ├── RuCLIP_onnx_example.ipynb
│ ├── clip_onnx_example.ipynb
│ ├── dev/
│ │ ├── clip_onnx_benchmark_cpu.ipynb
│ │ ├── clip_onnx_benchmark_gpu.ipynb
│ │ ├── clip_onnx_benchmark_gpu_K80.ipynb
│ │ └── clip_onnx_benchmark_gpu_T4.ipynb
│ ├── readme_example.ipynb
│ └── ru_CLIP_tiny_onnx.ipynb
├── requirements.txt
└── setup.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2022 Gerasimov Maxim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# CLIP-ONNX
A simple library to speed up CLIP inference by up to 3x (on a K80 GPU)!
[Open AI CLIP example (Colab)](https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/readme_example.ipynb)\
[RuCLIP example (Colab)](https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/RuCLIP_onnx_example.ipynb)\
[RuCLIP tiny example (Colab)](https://colab.research.google.com/github/Lednik7/CLIP-ONNX/blob/main/examples/ru_CLIP_tiny_onnx.ipynb)
## Usage
Install the clip-onnx module and its requirements first:
```python3
!pip install git+https://github.com/Lednik7/CLIP-ONNX.git
!pip install git+https://github.com/openai/CLIP.git
!pip install onnxruntime-gpu
```
## Example in 3 steps
0. Download the example image from the CLIP repo
```python3
!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true
```
1. Load the standard CLIP model, image, and text on CPU
```python3
import clip
from PIL import Image
import numpy as np
# keep the model on CPU for the ONNX export
model, preprocess = clip.load("ViT-B/32", device="cpu", jit=False)
# batch first
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).cpu() # [1, 3, 224, 224]
image_onnx = image.detach().cpu().numpy().astype(np.float32)
# batch first
text = clip.tokenize(["a diagram", "a dog", "a cat"]).cpu() # [3, 77]
text_onnx = text.detach().cpu().numpy().astype(np.int32)
```
2. Create a CLIP-ONNX object and convert the model to ONNX
```python3
from clip_onnx import clip_onnx
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"
onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)
onnx_model.convert2onnx(image, text, verbose=True)
# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
onnx_model.start_sessions(providers=["CPUExecutionProvider"]) # cpu mode
```
3. Use the standard CLIP API. Batch inference is supported
```python3
image_features = onnx_model.encode_image(image_onnx)
text_features = onnx_model.encode_text(text_onnx)
logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)
probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()
print("Label probs:", probs) # prints: [[0.9927937 0.00421067 0.00299571]]
```
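If you only need one of the encoders, `convert2onnx` also accepts a single dummy input and converts just that part. A minimal sketch for the text encoder only, reusing `model`, `text` and `text_onnx` from the steps above:
```python3
from clip_onnx import clip_onnx

text_model = clip_onnx(model, textual_path="clip_textual.onnx")
# visual_input=None skips the visual export and converts only the textual model
text_model.convert2onnx(visual_input=None, textual_input=text, verbose=True)
text_model.start_sessions(providers=["CPUExecutionProvider"])
text_features = text_model.encode_text(text_onnx)  # numpy array of shape [3, 512]
```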
**Enjoy the speed**
## Load saved model
Example for ViT-B/32 from Model Zoo
```python3
!wget https://clip-as-service.s3.us-east-2.amazonaws.com/models/onnx/ViT-B-32/visual.onnx
!wget https://clip-as-service.s3.us-east-2.amazonaws.com/models/onnx/ViT-B-32/textual.onnx
```
```python3
onnx_model = clip_onnx(None)
onnx_model.load_onnx(visual_path="visual.onnx",
textual_path="textual.onnx",
logit_scale=100.0000) # model.logit_scale.exp()
onnx_model.start_sessions(providers=["CPUExecutionProvider"])
```
## Model Zoo
Models of the original CLIP can be found on this [page](https://github.com/jina-ai/clip-as-service/blob/main/server/clip_server/model/clip_onnx.py).\
They are not part of this library but should work correctly.
## If something doesn't work
Sometimes ONNX fails to convert the model on the first attempt; in that case it is worth running the conversion again.
If that doesn't help, try changing the export settings.
The default export options look like this:
```python3
DEFAULT_EXPORT = dict(input_names=['input'], output_names=['output'],
export_params=True, verbose=False, opset_version=12,
do_constant_folding=True,
dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}})
```
You can change them pretty easily.
```python3
from clip_onnx.utils import DEFAULT_EXPORT
DEFAULT_EXPORT["opset_version"] = 15
```
Alternative option (change only visual or textual):
```python3
from clip_onnx import clip_onnx
from clip_onnx.utils import DEFAULT_EXPORT
visual_path = "clip_visual.onnx"
textual_path = "clip_textual.onnx"
textual_export_params = DEFAULT_EXPORT.copy()
textual_export_params["dynamic_axes"] = {'input': {1: 'batch_size'},
'output': {0: 'batch_size'}}
textual_export_params["opset_version"] = 12
Textual = lambda x: x
onnx_model = clip_onnx(model.cpu(), visual_path=visual_path, textual_path=textual_path)
onnx_model.convert2onnx(dummy_input_image, dummy_input_text, verbose=True,
textual_wrapper=Textual,
textual_export_params=textual_export_params)
```
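If the export fails inside CLIP's attention layers, the library also ships an ONNX-friendly reimplementation of the attention forward pass (ONNX export does not handle `multi_head_attention_forward` well). It can be monkey-patched onto CLIP before conversion, exactly as done in the example notebooks; a sketch reusing `model`, `image`, `text` and the paths from above:
```python3
import clip
from clip_onnx import clip_onnx, attention

# replace CLIP's attention with the ONNX-friendly version from clip_onnx.utils
clip.model.ResidualAttentionBlock.attention = attention

onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)
onnx_model.convert2onnx(image, text, verbose=True)
```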
## Best practices
See [benchmark.md](https://github.com/Lednik7/CLIP-ONNX/tree/main/benchmark.md)
## Examples
See [examples folder](https://github.com/Lednik7/CLIP-ONNX/tree/main/examples) for more details \
Some parts of the code were taken from this [post](https://twitter.com/apeoffire/status/1478493291008172038). Thanks to [neverix](https://github.com/neverix) for the original notebook.
================================================
FILE: benchmark.md
================================================
# CPU benchmarks
All timings below are wall-clock seconds.
#### Run on an Intel(R) Xeon(R) CPU @ 2.30 GHz with 2 cores (Google Colab session)
| ONNX | batch | encode_image | encode_text | total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 | 2 | 0.234 | 0.162 | 0.396 |
| ViT-B/32 | 8 | 0.923 | 0.656 | 1.579 |
| ViT-B/32 | 16 | 2.079 | 1.288 | 3.367 |
| ViT-B/32 | 32 | 3.937 | 2.658 | 6.595 |
| ViT-B/32 | 64 | 7.944 | 5.567 | 13.511 |
| TORCH | batch | encode_image | encode_text | total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 | 2 | 0.343 | 0.243 | 0.586 |
| ViT-B/32 | 8 | 1.093 | 0.831 | 1.924 |
| ViT-B/32 | 16 | 1.952 | 1.523 | 3.475 |
| ViT-B/32 | 32 | 4.079 | 3.015 | 7.094 |
| ViT-B/32 | 64 | 8.07 | 6.212 | 14.282 |
# GPU benchmarks
#### Run on NVIDIA Tesla K80 (Google Colab session)
| ONNX | batch | encode_image | encode_text | total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 | 2 | 0.136 | 0.021 | 0.157 |
| ViT-B/32 | 8 | 0.054 | 0.04 | 0.094 |
| ViT-B/32 | 16 | 0.089 | 0.071 | 0.16 |
| ViT-B/32 | 32 | 0.158 | 0.134 | 0.292 |
| ViT-B/32 | 64 | 0.325 | 0.258 | 0.583 |
| TORCH | batch | encode_image | encode_text | total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 | 2 | 0.02 | 0.035 | 0.055 |
| ViT-B/32 | 8 | 0.081 | 0.098 | 0.179 |
| ViT-B/32 | 16 | 0.207 | 0.196 | 0.403 |
| ViT-B/32 | 32 | 0.44 | 0.374 | 0.814 |
| ViT-B/32 | 64 | 0.919 | 0.719 | 1.638 |
#### Run on NVIDIA Tesla T4 (Google Colab session)
| ONNX | batch | encode_image | encode_text | total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 | 2 | 0.155 | 0.01 | 0.165 |
| ViT-B/32 | 8 | 0.032 | 0.014 | 0.046 |
| ViT-B/32 | 16 | 0.037 | 0.029 | 0.066 |
| ViT-B/32 | 32 | 0.076 | 0.059 | 0.135 |
| ViT-B/32 | 64 | 0.169 | 0.117 | 0.286 |
| TORCH | batch | encode_image | encode_text | total |
|:---------|--------:|---------------:|--------------:|--------:|
| ViT-B/32 | 2 | 0.017 | 0.009 | 0.026 |
| ViT-B/32 | 8 | 0.008 | 0.008 | 0.016 |
| ViT-B/32 | 16 | 0.009 | 0.012 | 0.021 |
| ViT-B/32 | 32 | 0.008 | 0.025 | 0.033 |
| ViT-B/32 | 64 | 0.009 | 0.049 | 0.058 |
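The tables above come from the notebooks in `examples/dev`, which time `encode_image`/`encode_text` on random inputs averaged over 5 runs. The library also bundles a `speed_test` helper that does the same averaging; a minimal sketch for the CPU rows, assuming `onnx_model` is a converted ViT-B/32 from the README steps:
```python3
import numpy as np
import torch
from clip_onnx import speed_test

batch = 8
# random dummy inputs with ViT-B/32 shapes: [batch, 3, 224, 224] images, [batch, 77] token ids
image_gen = lambda: torch.randint(1, 255, (batch, 3, 224, 224)).numpy().astype(np.float32)
text_gen = lambda: torch.randint(320, 49407, (batch, 77)).numpy().astype(np.int32)

print("encode_image:", speed_test(onnx_model.encode_image, image_gen, n=5))  # seconds per call
print("encode_text:", speed_test(onnx_model.encode_text, text_gen, n=5))
```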
================================================
FILE: clip_onnx/__init__.py
================================================
from .clip_converter import clip_converter
from .clip_onnx import clip_onnx
from .utils import Textual, attention
from .benchmark import speed_test
================================================
FILE: clip_onnx/benchmark.py
================================================
import time
import torch
def speed_test(func, data_gen, n: int = 5, empty_cache: bool = True):
if empty_cache:
torch.cuda.empty_cache()
values = []
for _ in range(n):
input_data = data_gen()
t = time.time()
func(input_data)
values.append(time.time() - t)
if empty_cache:
torch.cuda.empty_cache()
return sum(values) / n
================================================
FILE: clip_onnx/clip_converter.py
================================================
import torch
import onnx
from torch import nn
from onnxruntime.quantization import quantize_dynamic, QuantType
from .utils import Textual, DEFAULT_EXPORT
class clip_converter(nn.Module):
def __init__(self, model, visual_path: str = "clip_visual.onnx",
textual_path: str = "clip_textual.onnx"):
super().__init__()
self.model = model
self.visual_path = visual_path
self.textual_path = textual_path
self.visual_flag = False
self.textual_flag = False
self.logit_scale = self.model.logit_scale.exp()
self.model.eval()
for x in self.model.parameters():
x.requires_grad = False
def quantization(self, mode: str = "dynamic"):
assert mode in ["dynamic"]
if mode == "dynamic":
model_quant_visual = f"{self.visual_path}.quant"
quantize_dynamic(self.visual_path,
model_quant_visual,
weight_type=QuantType.QUInt8)
self.visual_path = model_quant_visual
model_quant_textual = f"{self.textual_path}.quant"
quantize_dynamic(self.textual_path,
model_quant_textual,
weight_type=QuantType.QUInt8)
self.textual_path = model_quant_textual
def torch_export(self, model, dummy_input, path: str, export_params=DEFAULT_EXPORT):
torch.onnx.export(model, dummy_input, path, **export_params)
def onnx_checker(self, path: str):
model = onnx.load(path)
onnx.checker.check_model(model)
del model
def convert_visual(self, dummy_input, wrapper=lambda x: x,
export_params=DEFAULT_EXPORT):
visual = wrapper(self.model.visual)
self.torch_export(visual, dummy_input, self.visual_path,
export_params=export_params)
self.onnx_checker(self.visual_path)
def convert_textual(self, dummy_input, wrapper=Textual,
export_params=DEFAULT_EXPORT):
textual = wrapper(self.model)
self.torch_export(textual, dummy_input, self.textual_path,
export_params=export_params)
self.onnx_checker(self.textual_path)
def convert2onnx(self, visual_input=None, textual_input=None, verbose=True,
visual_wrapper=lambda x: x,
textual_wrapper=Textual,
visual_export_params=DEFAULT_EXPORT,
textual_export_params=DEFAULT_EXPORT):
isinstance_visual_input = isinstance(visual_input, (torch.Tensor))
isinstance_textual_input = isinstance(textual_input, (torch.Tensor))
if (not isinstance_visual_input) and (not isinstance_textual_input):
raise Exception("[CLIP ONNX] Please, choose a dummy input")
elif not isinstance_visual_input:
print("[CLIP ONNX] Convert only textual model")
elif not isinstance_textual_input:
print("[CLIP ONNX] Convert only visual model")
if isinstance_visual_input:
self.visual_flag = True
if verbose:
print("[CLIP ONNX] Start convert visual model")
self.convert_visual(visual_input, visual_wrapper, visual_export_params)
if verbose:
print("[CLIP ONNX] Start check visual model")
self.onnx_checker(self.visual_path)
if isinstance_textual_input:
self.textual_flag = True
if verbose:
print("[CLIP ONNX] Start convert textual model")
self.convert_textual(textual_input, textual_wrapper, textual_export_params)
if verbose:
print("[CLIP ONNX] Start check textual model")
self.onnx_checker(self.textual_path)
if verbose:
print("[CLIP ONNX] Models converts successfully")
================================================
FILE: clip_onnx/clip_onnx.py
================================================
from .clip_converter import clip_converter
import torch
import onnxruntime
class clip_onnx(clip_converter):
def __init__(self, model=None,
visual_path: str = "clip_visual.onnx",
textual_path: str = "clip_textual.onnx"):
if not isinstance(model, (type(None))):
super().__init__(model, visual_path, textual_path)
else:
print("[CLIP ONNX] Load mode")
def load_onnx(self, visual_path=None, textual_path=None, logit_scale=None):
if visual_path and textual_path:
if not logit_scale:
raise Exception("For this mode logit_scale must be specified. Example: model.logit_scale.exp()")
self.logit_scale = logit_scale
if visual_path:
self.visual_path = visual_path
self.visual_flag = True
if textual_path:
self.textual_path = textual_path
self.textual_flag = True
def start_sessions(self, providers=['TensorrtExecutionProvider',
'CUDAExecutionProvider',
'CPUExecutionProvider']):
if self.visual_flag:
self.visual_session = onnxruntime.InferenceSession(self.visual_path,
providers=providers)
if self.textual_flag:
self.textual_session = onnxruntime.InferenceSession(self.textual_path,
providers=providers)
def visual_run(self, onnx_image):
onnx_input_image = {self.visual_session.get_inputs()[0].name: onnx_image}
visual_output, = self.visual_session.run(None, onnx_input_image)
return visual_output
def textual_run(self, onnx_text):
onnx_input_text = {self.textual_session.get_inputs()[0].name: onnx_text}
textual_output, = self.textual_session.run(None, onnx_input_text)
return textual_output
def __call__(self, image, text, device: str = "cpu"):
assert self.visual_flag and self.textual_flag
image_features = torch.from_numpy(self.visual_run(image)).to(device)
text_features = torch.from_numpy(self.textual_run(text)).to(device)
# normalized features
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
# cosine similarity as logits
logits_per_image = self.logit_scale * image_features @ text_features.t()
logits_per_text = logits_per_image.t()
# shape = [global_batch_size, global_batch_size]
return logits_per_image, logits_per_text
def encode_image(self, image):
return self.visual_run(image)
def encode_text(self, text):
return self.textual_run(text)
================================================
FILE: clip_onnx/utils.py
================================================
import torch.nn.functional as F
import torch
from torch import nn
class Textual(nn.Module):
def __init__(self, model):
super().__init__()
self.transformer = model.transformer
self.positional_embedding = model.positional_embedding
self.ln_final = model.ln_final
self.text_projection = model.text_projection
self.token_embedding = model.token_embedding
def forward(self, text):
x = self.token_embedding(text) # [batch_size, n_ctx, d_model]
x = x + self.positional_embedding
x = x.permute(1, 0, 2) # NLD -> LND
x = self.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD
x = self.ln_final(x)
# x.shape = [batch_size, n_ctx, transformer.width]
# take features from the eot embedding (eot_token is the highest number in each sequence)
# needs .float() before .argmax( ) to work
x = x[torch.arange(x.shape[0]), text.float().argmax(dim=-1)] @ self.text_projection
return x
def attention(self, x: torch.Tensor):
# onnx doesn't like multi_head_attention_forward so this is a reimplementation
self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
q, k, v = (torch.einsum("tbh, oh -> tbo", x, self.attn.in_proj_weight) + self.attn.in_proj_bias).contiguous().chunk(
3, dim=-1)
tgt_len = q.shape[0]
bsz = q.shape[1]
num_heads = self.attn.num_heads
head_dim = q.shape[2] // num_heads
attn_output = scaled_dot_product_attention(
q.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1),
k.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1),
v.reshape(tgt_len, bsz * num_heads, head_dim).transpose(0, 1), self.attn_mask, 0.0
)
attn_output = attn_output.transpose(0, 1).contiguous().view(q.shape)
attn_output = F.linear(attn_output, self.attn.out_proj.weight, self.attn.out_proj.bias)
return attn_output
def scaled_dot_product_attention(Q, K, V, attn_mask, dropout_p):
if attn_mask is None:
attn_weight = torch.softmax(Q @ K.transpose(-2, -1) / Q.size(-1)**0.5, dim=-1)
else:
attn_weight = torch.softmax(Q @ K.transpose(-2, -1) / Q.size(-1)**0.5 + attn_mask[None, ...], dim=-1)
# attn_weight = torch.dropout(attn_weight, dropout_p) # this is always 0.0 in CLIP so I comment it out.
return attn_weight @ V
DEFAULT_EXPORT = dict(input_names=['input'], output_names=['output'],
export_params=True, verbose=False, opset_version=12,
do_constant_folding=True,
dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}})
================================================
FILE: examples/RuCLIP_onnx_example.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "RuCLIP_onnx_example.ipynb",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"source": [
"#@title Allowed Resources\n",
"import multiprocessing\n",
"import torch\n",
"from psutil import virtual_memory\n",
"\n",
"ram_gb = round(virtual_memory().total / 1024**3, 1)\n",
"\n",
"print('CPU:', multiprocessing.cpu_count())\n",
"print('RAM GB:', ram_gb)\n",
"print(\"PyTorch version:\", torch.__version__)\n",
"print(\"CUDA version:\", torch.version.cuda)\n",
"print(\"cuDNN version:\", torch.backends.cudnn.version())\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"print(\"device:\", device.type)\n",
"\n",
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"cellView": "form",
"id": "4gfq46gnYcnU",
"outputId": "41e2054a-e2e4-4bb5-ed39-8bd8bfc639c3"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU: 2\n",
"RAM GB: 12.7\n",
"PyTorch version: 1.10.0+cu111\n",
"CUDA version: 11.1\n",
"cuDNN version: 8005\n",
"device: cuda\n",
"Wed Jan 19 22:10:10 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 495.46 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 41C P8 9W / 70W | 3MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "whlsBiJgR8le"
}
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n",
"!pip install ruclip==0.0.1rc7\n",
"!pip install onnxruntime-gpu"
],
"metadata": {
"id": "HnbpAkvuR73L"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "tqy0zKM4R-7M"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"\n",
"# priority device (if available)\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x8IN72OnSAIh",
"outputId": "3174cf2c-ace3-4e1f-a550-e16c72302d51"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## RuCLIP\n",
"WARNING: specific RuCLIP like forward \"model(text, image)\" instead of classic(OpenAI CLIP) \"model(image, text)\""
],
"metadata": {
"id": "8_wSsSheT5mw"
}
},
{
"cell_type": "code",
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning)"
],
"metadata": {
"id": "gZTxanR26knr"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import ruclip\n",
"\n",
"# onnx cannot export with cuda\n",
"model, processor = ruclip.load(\"ruclip-vit-base-patch32-384\", device=\"cpu\")"
],
"metadata": {
"id": "FdTLuqsJUBFY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# simple input\n",
"pil_images = [Image.open(\"CLIP.png\")]\n",
"labels = ['диаграмма', 'собака', 'кошка']\n",
"dummy_input = processor(text=labels, images=pil_images,\n",
" return_tensors='pt', padding=True)\n",
"\n",
"# batch first\n",
"image = dummy_input[\"pixel_values\"] # torch tensor [1, 3, 384, 384]\n",
"image_onnx = dummy_input[\"pixel_values\"].cpu().detach().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = dummy_input[\"input_ids\"] # torch tensor [3, 77]\n",
"text_onnx = dummy_input[\"input_ids\"].cpu().detach().numpy()[::-1].astype(np.int64)"
],
"metadata": {
"id": "rPwc6A2SSGyl"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#RuCLIP output\n",
"logits_per_image, logits_per_text = model(text, image)\n",
"probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()\n",
"\n",
"print(\"Label probs:\", probs) # prints: [[0.9885839 0.00894288 0.0024732 ]]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pv0mH626SdzO",
"outputId": "d563462f-b2a9-4d49-b491-17e88ffa81f0"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Label probs: [[0.9885839 0.00894288 0.0024732 ]]\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Convert RuCLIP model to ONNX"
],
"metadata": {
"id": "R_e5OjJeXRiF"
}
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx\n",
"\n",
"visual_path = \"clip_visual.onnx\"\n",
"textual_path = \"clip_textual.onnx\"\n",
"\n",
"onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)\n",
"onnx_model.convert2onnx(image, text, verbose=True)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oYM5FDSGSJBW",
"outputId": "c647dc2e-946d-4769-c66e-77edfa98237f"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n",
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n",
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## [ONNX] CPU inference mode"
],
"metadata": {
"id": "U1Pr-YTtSEhs"
}
},
{
"cell_type": "code",
"source": [
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode"
],
"metadata": {
"id": "aY9wRe5kT3wG"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"image_features = onnx_model.encode_image(image_onnx)\n",
"text_features = onnx_model.encode_text(text_onnx)\n",
"\n",
"logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)\n",
"probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()\n",
"\n",
"print(\"Label probs:\", probs) # prints: Label probs: [[0.90831375 0.07174418 0.01994203]]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tYVuk72nSLw6",
"outputId": "75bf3803-6ed7-4516-ccd0-42f9cf7f22e0"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Label probs: [[0.90831375 0.07174418 0.01994203]]\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model.encode_text(text_onnx) # text representation"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Bpu4_HFRVeNk",
"outputId": "e8f1681b-40dc-495f-d382-f0348d87c412"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"1 loop, best of 5: 285 ms per loop\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model.encode_image(image_onnx) # image representation"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JsOccP2gVmpo",
"outputId": "adb33860-b000-461b-959f-95126e2ac049"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"1 loop, best of 5: 412 ms per loop\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## [ONNX] GPU inference mode"
],
"metadata": {
"id": "Zww0E-jIULug"
}
},
{
"cell_type": "code",
"source": [
"onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # cuda mode"
],
"metadata": {
"id": "PBakYeiQUOAm"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model.encode_text(text_onnx) # text representation"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EjvRBvCaWJBL",
"outputId": "07426652-1cc5-4713-c355-fb4f1bd138d4"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The slowest run took 5.07 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"100 loops, best of 5: 6.89 ms per loop\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model.encode_image(image_onnx) # image representation"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pmu4mQCsWJ8w",
"outputId": "5cb45026-dfd3-419d-e5d3-f5d0d9681cd0"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The slowest run took 699.84 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"1 loop, best of 5: 18.9 ms per loop\n"
]
}
]
}
]
}
================================================
FILE: examples/clip_onnx_example.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "clip_onnx_example.ipynb",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "fxPg_VvZuScV"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "al_QNjyFq6Jj"
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n",
"!pip install git+https://github.com/openai/CLIP.git\n",
"!pip install onnxruntime-gpu"
]
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "42eeJz9lTdJ6"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XuauIZIBSEUX",
"outputId": "2c7c2bd9-90dd-4b1a-e98a-79e1f2218644"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Thu Jan 6 16:36:44 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 495.44 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 35C P8 26W / 149W | 0MiB / 11441MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"id": "gqvxpdajRX5_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## CPU inference mode"
],
"metadata": {
"id": "010k-ksVTjAu"
}
},
{
"cell_type": "markdown",
"source": [
"### Torch CLIP"
],
"metadata": {
"id": "KdTz0IJWVBqE"
}
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# onnx cannot work with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
"\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).cpu() # [1, 3, 224, 224]\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).cpu() # [3, 77]\n",
"text_onnx = text.detach().cpu().numpy().astype(np.int64)"
],
"metadata": {
"id": "9ROPwKYurOhP"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%timeit model(image, text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1CrHQ8cYt8Cx",
"outputId": "4d98f85d-4b02-4ae2-b18f-fb3c7a2d6caf"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"1 loop, best of 5: 636 ms per loop\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### CLIP-ONNX"
],
"metadata": {
"id": "Ao2MriaVVG6Y"
}
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx, attention\n",
"clip.model.ResidualAttentionBlock.attention = attention\n",
"\n",
"onnx_model = clip_onnx(model)\n",
"onnx_model.convert2onnx(image, text, verbose=True)\n",
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode"
],
"metadata": {
"id": "nSeG9uAZrcph",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8c394684-d78e-49f6-a60f-872485d5f650"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/clip_onnx/utils.py:40: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
" head_dim = q.shape[2] // num_heads\n",
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:716: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n",
" warnings.warn(\"allowzero=0 by default. In order to honor zero value in shape use allowzero=1\")\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2819: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
" \"If indices include negative values, the exported graph will produce incorrect results.\")\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model(image_onnx, text_onnx)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "B15dr51UrvMh",
"outputId": "7c5fbc64-61f5-4742-d5a1-24d123971515"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"1 loop, best of 5: 550 ms per loop\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## GPU inference mode\n",
"Select a runtime GPU to continue:\n",
"\n",
"Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
],
"metadata": {
"id": "Ahh_7CkTUb8y"
}
},
{
"cell_type": "markdown",
"source": [
"### CLIP-ONNX"
],
"metadata": {
"id": "B6M7yq7qceb5"
}
},
{
"cell_type": "code",
"source": [
"onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "6LtPSZhfUd_m"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model.visual_session.get_providers() # optional"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xE0VGt9sQwrf",
"outputId": "6feb4701-7b7f-437e-dc2f-c95c504dbb89"
},
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['CUDAExecutionProvider', 'CPUExecutionProvider']"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model(image_onnx, text_onnx)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iPUVzqmgcYas",
"outputId": "3e7c1526-6e38-4982-ca36-eabfc95c2ab9"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The slowest run took 79.70 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"1 loop, best of 5: 60.8 ms per loop\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"### Torch CLIP"
],
"metadata": {
"id": "jb58mrkbch2V"
}
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"\n",
"device = \"cuda\"\n",
"# onnx cannot work with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=device, jit=False)\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).to(device) # [1, 3, 224, 224]\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).to(device) # [3, 77]"
],
"metadata": {
"id": "gidR99GOckyF"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%timeit model(image, text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XpBrtjlOcwOC",
"outputId": "56375401-18a0-499b-f29b-c6e2d4d07e42"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"10 loops, best of 5: 72.2 ms per loop\n"
]
}
]
}
]
}
================================================
FILE: examples/dev/clip_onnx_benchmark_cpu.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "clip-onnx-benchmark-cpu.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyNUvpypuYYk54s1lZecP8Pf",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "fxPg_VvZuScV"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "al_QNjyFq6Jj"
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev\n",
"!pip install git+https://github.com/openai/CLIP.git\n",
"!pip install onnxruntime-gpu"
]
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "42eeJz9lTdJ6"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XuauIZIBSEUX",
"outputId": "7e3fa9a5-2970-4bc1-81e5-9ec997a267a1"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Tue May 3 06:56:57 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 47C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gqvxpdajRX5_",
"outputId": "4ad23904-186a-4e19-af9a-66538a70a3c8"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## GPU inference mode\n",
"Select a runtime GPU to continue:\n",
"\n",
"Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
],
"metadata": {
"id": "010k-ksVTjAu"
}
},
{
"cell_type": "markdown",
"source": [
"### Torch CLIP"
],
"metadata": {
"id": "KdTz0IJWVBqE"
}
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# onnx cannot work with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
"\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n",
"text_onnx = text.detach().cpu().numpy().astype(np.int32)"
],
"metadata": {
"id": "9ROPwKYurOhP"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### CLIP-ONNX"
],
"metadata": {
"id": "Ao2MriaVVG6Y"
}
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx\n",
"\n",
"onnx_model = clip_onnx(model)\n",
"onnx_model.convert2onnx(image, text, verbose=True)\n",
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "nSeG9uAZrcph",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "32e7fb6e-191a-4c3a-a8be-42ddf41ee62d"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n",
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2909: UserWarning: Exporting aten::index operator of advanced indexing in opset 12 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
" \"If indices include negative values, the exported graph will produce incorrect results.\")\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"onnx_model = clip_onnx(model)\n",
"onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n",
" \"/content/clip_textual.onnx\",\n",
" model.logit_scale.exp())\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "PsDS7ty79zZf"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model.visual_session.get_providers()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aZsGJNrbNCYe",
"outputId": "27eec69c-6535-46e1-d98a-15836459149e"
},
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['CPUExecutionProvider']"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"source": [
"## Benchmark"
],
"metadata": {
"id": "J5IcOG_6jAFz"
}
},
{
"cell_type": "code",
"source": [
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)"
],
"metadata": {
"id": "SJ_5_x7vLepK"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.eval()\n",
"for x in model.parameters():\n",
" x.requires_grad = False"
],
"metadata": {
"id": "OnOzZ3LMuubW"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy, random, torch"
],
"metadata": {
"id": "wDwqRRrTGKUS"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def set_seed():\n",
" torch.manual_seed(12)\n",
" torch.cuda.manual_seed(12)\n",
" np.random.seed(12)\n",
" random.seed(12)\n",
"\n",
" torch.backends.cudnn.deterministic=True"
],
"metadata": {
"id": "9H17n_6gGJgT"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import time\n",
"\n",
"n = 5\n",
"clip_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"onnx_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"for batch in [2, 8, 16, 32, 64]:\n",
" set_seed()\n",
" t_mean = []\n",
" for _ in range(n):\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n",
" image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n",
" t = time.time()\n",
" onnx_model.encode_image(image_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n",
" t = time.time()\n",
" model.encode_image(image_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" t_mean = []\n",
" for _ in range(n):\n",
" text_input = torch.randint(320, 49407, (batch, 77))\n",
" text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)\n",
" t = time.time()\n",
" onnx_model.encode_text(text_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" text_input = torch.randint(320, 49407, (batch, 77))\n",
" t = time.time()\n",
" model.encode_text(text_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" print(\"-\" * 78)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4lFL6tzWjiWL",
"outputId": "45819718-619e-429c-9aa4-7e28b068b9a3"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"onnx 2 encode_image 0.234\n",
"torch 2 encode_image 0.343\n",
"onnx 2 encode_text 0.162\n",
"torch 2 encode_text 0.243\n",
"------------------------------------------------------------------------------\n",
"onnx 8 encode_image 0.923\n",
"torch 8 encode_image 1.093\n",
"onnx 8 encode_text 0.656\n",
"torch 8 encode_text 0.831\n",
"------------------------------------------------------------------------------\n",
"onnx 16 encode_image 2.079\n",
"torch 16 encode_image 1.952\n",
"onnx 16 encode_text 1.288\n",
"torch 16 encode_text 1.523\n",
"------------------------------------------------------------------------------\n",
"onnx 32 encode_image 3.937\n",
"torch 32 encode_image 4.079\n",
"onnx 32 encode_text 2.658\n",
"torch 32 encode_text 3.015\n",
"------------------------------------------------------------------------------\n",
"onnx 64 encode_image 7.944\n",
"torch 64 encode_image 8.07\n",
"onnx 64 encode_text 5.567\n",
"torch 64 encode_text 6.212\n",
"------------------------------------------------------------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd"
],
"metadata": {
"id": "P2YhbE9v_4ci"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n",
" \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n",
" \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n",
" clip_results[\"encode_image\"]) for j in i],\n",
" \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n",
" clip_results[\"encode_text\"]) for j in i]})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 362
},
"id": "WfZfDk4PAlqm",
"outputId": "38710ad6-09ae-4c48-fc20-1cdabf4c2a50"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" backend batch encode_image encode_text\n",
"0 onnx 2 0.234 0.162\n",
"1 torch 2 0.343 0.243\n",
"2 onnx 8 0.923 0.656\n",
"3 torch 8 1.093 0.831\n",
"4 onnx 16 2.079 1.288\n",
"5 torch 16 1.952 1.523\n",
"6 onnx 32 3.937 2.658\n",
"7 torch 32 4.079 3.015\n",
"8 onnx 64 7.944 5.567\n",
"9 torch 64 8.070 6.212"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n",
"onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]"
],
"metadata": {
"id": "Xpw9lV7yBbA8"
},
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "LItAyQkeDhnQ",
"outputId": "37517a71-baf3-494c-8a46-9f05cbfb7d32"
},
"execution_count": 17,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" ONNX batch encode_image encode_text total\n",
"0 ViT-B/32 2 0.234 0.162 0.396\n",
"1 ViT-B/32 8 0.923 0.656 1.579\n",
"2 ViT-B/32 16 2.079 1.288 3.367\n",
"3 ViT-B/32 32 3.937 2.658 6.595\n",
"4 ViT-B/32 64 7.944 5.567 13.511"
]
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"source": [
"print(onnx_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AIQDA9FaJZ7Y",
"outputId": "8e8d4109-822e-4328-b2ca-66d4b9a19f8d"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| ONNX | batch | encode_image | encode_text | total |\n",
"|:---------|--------:|---------------:|--------------:|--------:|\n",
"| ViT-B/32 | 2 | 0.234 | 0.162 | 0.396 |\n",
"| ViT-B/32 | 8 | 0.923 | 0.656 | 1.579 |\n",
"| ViT-B/32 | 16 | 2.079 | 1.288 | 3.367 |\n",
"| ViT-B/32 | 32 | 3.937 | 2.658 | 6.595 |\n",
"| ViT-B/32 | 64 | 7.944 | 5.567 | 13.511 |\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n",
"clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]"
],
"metadata": {
"id": "E1OXQUDvDZmI"
},
"execution_count": 19,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(clip_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xAj-ynhCDpPO",
"outputId": "88243c7f-bd6d-4a63-9ee2-154440c3df7e"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| TORCH | batch | encode_image | encode_text | total |\n",
"|:---------|--------:|---------------:|--------------:|--------:|\n",
"| ViT-B/32 | 2 | 0.343 | 0.243 | 0.586 |\n",
"| ViT-B/32 | 8 | 1.093 | 0.831 | 1.924 |\n",
"| ViT-B/32 | 16 | 1.952 | 1.523 | 3.475 |\n",
"| ViT-B/32 | 32 | 4.079 | 3.015 | 7.094 |\n",
"| ViT-B/32 | 64 | 8.07 | 6.212 | 14.282 |\n"
]
}
]
}
]
}
================================================
FILE: examples/dev/clip_onnx_benchmark_gpu.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "clip-onnx-benchmark-gpu.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "fxPg_VvZuScV"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "al_QNjyFq6Jj"
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n",
"!pip install git+https://github.com/openai/CLIP.git\n",
"!pip install onnxruntime-gpu"
]
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "42eeJz9lTdJ6"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XuauIZIBSEUX",
"outputId": "7e2b352b-751e-439e-bb3d-4e1323e2e44d"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Thu Jan 6 15:47:04 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 495.44 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 34C P8 28W / 149W | 0MiB / 11441MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gqvxpdajRX5_",
"outputId": "7c44b4e1-d916-42d9-cc61-52efdf0fa9a9"
},
"execution_count": 20,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## GPU inference mode\n",
"Select a runtime GPU to continue:\n",
"\n",
"Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
],
"metadata": {
"id": "010k-ksVTjAu"
}
},
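{
"cell_type": "markdown",
"source": [
"A quick sanity check before converting (a minimal sketch): confirm that onnxruntime-gpu actually exposes the CUDA provider via `onnxruntime.get_available_providers`."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: verify the CUDA provider is available before starting GPU sessions.\n",
"import onnxruntime\n",
"\n",
"providers = onnxruntime.get_available_providers()\n",
"print(providers)\n",
"assert \"CUDAExecutionProvider\" in providers, \"install onnxruntime-gpu and select a GPU runtime\""
],
"metadata": {},
"execution_count": null,
"outputs": []
},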
{
"cell_type": "markdown",
"source": [
"### Torch CLIP"
],
"metadata": {
"id": "KdTz0IJWVBqE"
}
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# onnx cannot work with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
"\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n",
"text_onnx = text.detach().cpu().numpy().astype(np.int64)"
],
"metadata": {
"id": "9ROPwKYurOhP"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### CLIP-ONNX"
],
"metadata": {
"id": "Ao2MriaVVG6Y"
}
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx, attention\n",
"clip.model.ResidualAttentionBlock.attention = attention\n",
"\n",
"onnx_model = clip_onnx(model)\n",
"onnx_model.convert2onnx(image, text, verbose=False)\n",
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "nSeG9uAZrcph",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "25e07d68-6ef2-44c4-d144-c43b611f3316"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/clip_onnx/utils.py:40: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').\n",
" head_dim = q.shape[2] // num_heads\n",
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:716: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n",
" warnings.warn(\"allowzero=0 by default. In order to honor zero value in shape use allowzero=1\")\n",
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2819: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
" \"If indices include negative values, the exported graph will produce incorrect results.\")\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx, attention\n",
"clip.model.ResidualAttentionBlock.attention = attention"
],
"metadata": {
"id": "imMVbHFO-KSH"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model = clip_onnx(model)\n",
"onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n",
" \"/content/clip_textual.onnx\",\n",
" model.logit_scale.exp())\n",
"onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "PsDS7ty79zZf"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model.visual_session.get_providers()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aZsGJNrbNCYe",
"outputId": "9dcdd2d6-2a73-4dad-9ea7-c2892273c631"
},
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['CUDAExecutionProvider', 'CPUExecutionProvider']"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "markdown",
"source": [
"## Benchmark"
],
"metadata": {
"id": "J5IcOG_6jAFz"
}
},
{
"cell_type": "code",
"source": [
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cuda\", jit=False)"
],
"metadata": {
"id": "SJ_5_x7vLepK"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.eval()\n",
"for x in model.parameters():\n",
" x.requires_grad = False"
],
"metadata": {
"id": "OnOzZ3LMuubW"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy, random, torch"
],
"metadata": {
"id": "wDwqRRrTGKUS"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def set_seed():\n",
" torch.manual_seed(12)\n",
" torch.cuda.manual_seed(12)\n",
" np.random.seed(12)\n",
" random.seed(12)\n",
"\n",
" torch.backends.cudnn.deterministic=True"
],
"metadata": {
"id": "9H17n_6gGJgT"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%timeit onnx_model.encode_image(image_onnx)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "IsJ2TsBRNh8f",
"outputId": "bb642ee7-0112-4195-be35-14fdf719e7bc"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The slowest run took 23.27 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"1 loop, best of 5: 20.1 ms per loop\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import time\n",
"\n",
"n = 5\n",
"clip_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"onnx_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"for batch in [2, 8, 16, 32, 64]:\n",
" set_seed()\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()\n",
" text_input = torch.randint(320, 49407, (batch, 77)).cuda()\n",
" image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n",
" text_input_onnx = text_input.detach().cpu().numpy().astype(np.int64)\n",
"\n",
" t_mean = []\n",
" for _ in range(n):\n",
" t = time.time()\n",
" onnx_model.encode_image(image_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" t = time.time()\n",
" model.encode_image(image_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" t_mean = []\n",
" for _ in range(n):\n",
" t = time.time()\n",
" onnx_model.encode_text(text_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" t = time.time()\n",
" model.encode_text(text_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" print(\"-\" * 78)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4lFL6tzWjiWL",
"outputId": "a209b78a-fe78-4b46-9220-4b9624a1568f"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"onnx 2 encode_image 0.073\n",
"torch 2 encode_image 0.041\n",
"onnx 2 encode_text 0.032\n",
"torch 2 encode_text 0.033\n",
"------------------------------------------------------------------------------\n",
"onnx 8 encode_image 0.088\n",
"torch 8 encode_image 0.128\n",
"onnx 8 encode_text 0.052\n",
"torch 8 encode_text 0.102\n",
"------------------------------------------------------------------------------\n",
"onnx 16 encode_image 0.123\n",
"torch 16 encode_image 0.258\n",
"onnx 16 encode_text 0.08\n",
"torch 16 encode_text 0.201\n",
"------------------------------------------------------------------------------\n",
"onnx 32 encode_image 0.196\n",
"torch 32 encode_image 0.505\n",
"onnx 32 encode_text 0.138\n",
"torch 32 encode_text 0.386\n",
"------------------------------------------------------------------------------\n",
"onnx 64 encode_image 0.352\n",
"torch 64 encode_image 0.995\n",
"onnx 64 encode_text 0.252\n",
"torch 64 encode_text 0.754\n",
"------------------------------------------------------------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd"
],
"metadata": {
"id": "P2YhbE9v_4ci"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n",
" \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n",
" \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n",
" clip_results[\"encode_image\"]) for j in i],\n",
" \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n",
" clip_results[\"encode_text\"]) for j in i]})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 362
},
"id": "WfZfDk4PAlqm",
"outputId": "aa180c38-35f8-403c-a172-4e78266510d5"
},
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" backend | \n",
" batch | \n",
" encode_image | \n",
" encode_text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" onnx | \n",
" 2 | \n",
" 0.073 | \n",
" 0.032 | \n",
"
\n",
" \n",
" | 1 | \n",
" torch | \n",
" 2 | \n",
" 0.041 | \n",
" 0.033 | \n",
"
\n",
" \n",
" | 2 | \n",
" onnx | \n",
" 8 | \n",
" 0.088 | \n",
" 0.052 | \n",
"
\n",
" \n",
" | 3 | \n",
" torch | \n",
" 8 | \n",
" 0.128 | \n",
" 0.102 | \n",
"
\n",
" \n",
" | 4 | \n",
" onnx | \n",
" 16 | \n",
" 0.123 | \n",
" 0.080 | \n",
"
\n",
" \n",
" | 5 | \n",
" torch | \n",
" 16 | \n",
" 0.258 | \n",
" 0.201 | \n",
"
\n",
" \n",
" | 6 | \n",
" onnx | \n",
" 32 | \n",
" 0.196 | \n",
" 0.138 | \n",
"
\n",
" \n",
" | 7 | \n",
" torch | \n",
" 32 | \n",
" 0.505 | \n",
" 0.386 | \n",
"
\n",
" \n",
" | 8 | \n",
" onnx | \n",
" 64 | \n",
" 0.352 | \n",
" 0.252 | \n",
"
\n",
" \n",
" | 9 | \n",
" torch | \n",
" 64 | \n",
" 0.995 | \n",
" 0.754 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
],
"text/plain": [
" backend batch encode_image encode_text\n",
"0 onnx 2 0.073 0.032\n",
"1 torch 2 0.041 0.033\n",
"2 onnx 8 0.088 0.052\n",
"3 torch 8 0.128 0.102\n",
"4 onnx 16 0.123 0.080\n",
"5 torch 16 0.258 0.201\n",
"6 onnx 32 0.196 0.138\n",
"7 torch 32 0.505 0.386\n",
"8 onnx 64 0.352 0.252\n",
"9 torch 64 0.995 0.754"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n",
"onnx_df[\"summary\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]"
],
"metadata": {
"id": "Xpw9lV7yBbA8"
},
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "LItAyQkeDhnQ",
"outputId": "ebd84ad1-f305-4578-9164-2884aaa2b245"
},
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ONNX | \n",
" batch | \n",
" encode_image | \n",
" encode_text | \n",
" summary | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" ViT-B/32 | \n",
" 2 | \n",
" 0.073 | \n",
" 0.032 | \n",
" 0.105 | \n",
"
\n",
" \n",
" | 1 | \n",
" ViT-B/32 | \n",
" 8 | \n",
" 0.088 | \n",
" 0.052 | \n",
" 0.140 | \n",
"
\n",
" \n",
" | 2 | \n",
" ViT-B/32 | \n",
" 16 | \n",
" 0.123 | \n",
" 0.080 | \n",
" 0.203 | \n",
"
\n",
" \n",
" | 3 | \n",
" ViT-B/32 | \n",
" 32 | \n",
" 0.196 | \n",
" 0.138 | \n",
" 0.334 | \n",
"
\n",
" \n",
" | 4 | \n",
" ViT-B/32 | \n",
" 64 | \n",
" 0.352 | \n",
" 0.252 | \n",
" 0.604 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
],
"text/plain": [
" ONNX batch encode_image encode_text summary\n",
"0 ViT-B/32 2 0.073 0.032 0.105\n",
"1 ViT-B/32 8 0.088 0.052 0.140\n",
"2 ViT-B/32 16 0.123 0.080 0.203\n",
"3 ViT-B/32 32 0.196 0.138 0.334\n",
"4 ViT-B/32 64 0.352 0.252 0.604"
]
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"source": [
"print(onnx_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AIQDA9FaJZ7Y",
"outputId": "4fdfd92a-5c8c-43d9-e875-7bcddc882113"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| ONNX | batch | encode_image | encode_text | summary |\n",
"|:---------|--------:|---------------:|--------------:|----------:|\n",
"| ViT-B/32 | 2 | 0.073 | 0.032 | 0.105 |\n",
"| ViT-B/32 | 8 | 0.088 | 0.052 | 0.14 |\n",
"| ViT-B/32 | 16 | 0.123 | 0.08 | 0.203 |\n",
"| ViT-B/32 | 32 | 0.196 | 0.138 | 0.334 |\n",
"| ViT-B/32 | 64 | 0.352 | 0.252 | 0.604 |\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n",
"clip_df[\"summary\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]"
],
"metadata": {
"id": "E1OXQUDvDZmI"
},
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(clip_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xAj-ynhCDpPO",
"outputId": "6a36903d-6bba-4675-8eb3-7f58af98e165"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| TORCH | batch | encode_image | encode_text | summary |\n",
"|:---------|--------:|---------------:|--------------:|----------:|\n",
"| ViT-B/32 | 2 | 0.041 | 0.033 | 0.074 |\n",
"| ViT-B/32 | 8 | 0.128 | 0.102 | 0.23 |\n",
"| ViT-B/32 | 16 | 0.258 | 0.201 | 0.459 |\n",
"| ViT-B/32 | 32 | 0.505 | 0.386 | 0.891 |\n",
"| ViT-B/32 | 64 | 0.995 | 0.754 | 1.749 |\n"
]
}
]
}
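,
{
"cell_type": "markdown",
"source": [
"A minimal follow-up sketch, assuming `onnx_df` and `clip_df` from the cells above are still in memory: dividing the Torch `summary` column by the ONNX `summary` column gives the per-batch speedup factor."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: per-batch speedup of ONNX Runtime over Torch on this GPU.\n",
"# Assumes onnx_df and clip_df (with their \"summary\" columns) from the cells above.\n",
"speedup_df = pd.DataFrame({\"batch\": onnx_df[\"batch\"],\n",
"                           \"speedup\": (clip_df[\"summary\"] / onnx_df[\"summary\"]).round(2)})\n",
"print(speedup_df.to_markdown(index=False))"
],
"metadata": {},
"execution_count": null,
"outputs": []
}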
]
}
================================================
FILE: examples/dev/clip_onnx_benchmark_gpu_K80.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "clip-onnx-benchmark-gpu-K80.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyOXxz4T8v9RCW/JZlRRUtl4",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "fxPg_VvZuScV"
}
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "al_QNjyFq6Jj"
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev\n",
"!pip install git+https://github.com/openai/CLIP.git\n",
"!pip install onnxruntime-gpu"
]
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "42eeJz9lTdJ6"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XuauIZIBSEUX",
"outputId": "3bfb5833-272d-4aa0-f296-edab8122547c"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Tue May 3 07:20:58 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 56C P8 29W / 149W | 0MiB / 11441MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gqvxpdajRX5_",
"outputId": "bb8e9195-fe9c-421c-e27b-d76da7136b82"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## GPU inference mode\n",
"Select a runtime GPU to continue:\n",
"\n",
"Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
],
"metadata": {
"id": "010k-ksVTjAu"
}
},
{
"cell_type": "markdown",
"source": [
"### Torch CLIP"
],
"metadata": {
"id": "KdTz0IJWVBqE"
}
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# onnx cannot work with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
"\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n",
"text_onnx = text.detach().cpu().numpy().astype(np.int32)"
],
"metadata": {
"id": "9ROPwKYurOhP"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### CLIP-ONNX"
],
"metadata": {
"id": "Ao2MriaVVG6Y"
}
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx\n",
"from clip_onnx.utils import DEFAULT_EXPORT\n",
"\n",
"DEFAULT_EXPORT[\"opset_version\"] = 15\n",
"\n",
"onnx_model = clip_onnx(model)\n",
"onnx_model.convert2onnx(image, text, verbose=True)\n",
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "nSeG9uAZrcph",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "1d4a8404-104f-4107-f2c4-e7e1f7b1d104"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_helper.py:719: UserWarning: allowzero=0 by default. In order to honor zero value in shape use allowzero=1\n",
" warnings.warn(\"allowzero=0 by default. In order to honor zero value in shape use allowzero=1\")\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2909: UserWarning: Exporting aten::index operator of advanced indexing in opset 15 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
" \"If indices include negative values, the exported graph will produce incorrect results.\")\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"onnx_model = clip_onnx(model)\n",
"onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n",
" \"/content/clip_textual.onnx\",\n",
" model.logit_scale.exp())\n",
"onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "PsDS7ty79zZf"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model.visual_session.get_providers()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aZsGJNrbNCYe",
"outputId": "b0ee40a7-2ece-4e88-9e35-9ed0a735c533"
},
"execution_count": 7,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['CUDAExecutionProvider', 'CPUExecutionProvider']"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "markdown",
"source": [
"## Benchmark"
],
"metadata": {
"id": "J5IcOG_6jAFz"
}
},
{
"cell_type": "code",
"source": [
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cuda\", jit=False)"
],
"metadata": {
"id": "SJ_5_x7vLepK"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.eval()\n",
"for x in model.parameters():\n",
" x.requires_grad = False"
],
"metadata": {
"id": "OnOzZ3LMuubW"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy, random, torch"
],
"metadata": {
"id": "wDwqRRrTGKUS"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def set_seed():\n",
" torch.manual_seed(12)\n",
" torch.cuda.manual_seed(12)\n",
" np.random.seed(12)\n",
" random.seed(12)\n",
"\n",
" torch.backends.cudnn.deterministic=True"
],
"metadata": {
"id": "9H17n_6gGJgT"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import time\n",
"\n",
"n = 5\n",
"clip_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"onnx_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"for batch in [2, 8, 16, 32, 64]:\n",
" set_seed()\n",
" t_mean = []\n",
" for _ in range(n):\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n",
" image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n",
" t = time.time()\n",
" onnx_model.encode_image(image_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()\n",
" t = time.time()\n",
" model.encode_image(image_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" t_mean = []\n",
" for _ in range(n):\n",
" text_input = torch.randint(320, 49407, (batch, 77))\n",
" text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)\n",
" t = time.time()\n",
" onnx_model.encode_text(text_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" text_input = torch.randint(320, 49407, (batch, 77)).cuda()\n",
" t = time.time()\n",
" model.encode_text(text_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" print(\"-\" * 78)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4lFL6tzWjiWL",
"outputId": "ccaa7e0a-96f3-4a51-c4bd-c442aa13763c"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"onnx 2 encode_image 0.136\n",
"torch 2 encode_image 0.02\n",
"onnx 2 encode_text 0.021\n",
"torch 2 encode_text 0.035\n",
"------------------------------------------------------------------------------\n",
"onnx 8 encode_image 0.054\n",
"torch 8 encode_image 0.081\n",
"onnx 8 encode_text 0.04\n",
"torch 8 encode_text 0.098\n",
"------------------------------------------------------------------------------\n",
"onnx 16 encode_image 0.089\n",
"torch 16 encode_image 0.207\n",
"onnx 16 encode_text 0.071\n",
"torch 16 encode_text 0.196\n",
"------------------------------------------------------------------------------\n",
"onnx 32 encode_image 0.158\n",
"torch 32 encode_image 0.44\n",
"onnx 32 encode_text 0.134\n",
"torch 32 encode_text 0.374\n",
"------------------------------------------------------------------------------\n",
"onnx 64 encode_image 0.325\n",
"torch 64 encode_image 0.919\n",
"onnx 64 encode_text 0.258\n",
"torch 64 encode_text 0.719\n",
"------------------------------------------------------------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd"
],
"metadata": {
"id": "P2YhbE9v_4ci"
},
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n",
" \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n",
" \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n",
" clip_results[\"encode_image\"]) for j in i],\n",
" \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n",
" clip_results[\"encode_text\"]) for j in i]})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 362
},
"id": "WfZfDk4PAlqm",
"outputId": "78a5cae8-68ee-4edd-f34d-ccf7d3d8a23b"
},
"execution_count": 14,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" backend batch encode_image encode_text\n",
"0 onnx 2 0.136 0.021\n",
"1 torch 2 0.020 0.035\n",
"2 onnx 8 0.054 0.040\n",
"3 torch 8 0.081 0.098\n",
"4 onnx 16 0.089 0.071\n",
"5 torch 16 0.207 0.196\n",
"6 onnx 32 0.158 0.134\n",
"7 torch 32 0.440 0.374\n",
"8 onnx 64 0.325 0.258\n",
"9 torch 64 0.919 0.719"
]
},
"metadata": {},
"execution_count": 14
}
]
},
{
"cell_type": "code",
"source": [
"onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n",
"onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]"
],
"metadata": {
"id": "Xpw9lV7yBbA8"
},
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "LItAyQkeDhnQ",
"outputId": "f9c1860c-e405-4d41-e530-d2b0027f1fd0"
},
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" ONNX batch encode_image encode_text total\n",
"0 ViT-B/32 2 0.136 0.021 0.157\n",
"1 ViT-B/32 8 0.054 0.040 0.094\n",
"2 ViT-B/32 16 0.089 0.071 0.160\n",
"3 ViT-B/32 32 0.158 0.134 0.292\n",
"4 ViT-B/32 64 0.325 0.258 0.583"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "code",
"source": [
"print(onnx_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AIQDA9FaJZ7Y",
"outputId": "36aa68bb-8ebb-47de-d2b4-b8ce36cacfd7"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| ONNX | batch | encode_image | encode_text | total |\n",
"|:---------|--------:|---------------:|--------------:|--------:|\n",
"| ViT-B/32 | 2 | 0.136 | 0.021 | 0.157 |\n",
"| ViT-B/32 | 8 | 0.054 | 0.04 | 0.094 |\n",
"| ViT-B/32 | 16 | 0.089 | 0.071 | 0.16 |\n",
"| ViT-B/32 | 32 | 0.158 | 0.134 | 0.292 |\n",
"| ViT-B/32 | 64 | 0.325 | 0.258 | 0.583 |\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n",
"clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]"
],
"metadata": {
"id": "E1OXQUDvDZmI"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(clip_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xAj-ynhCDpPO",
"outputId": "6f31dab3-8b2a-4b64-ed97-2ac309d6d749"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| TORCH | batch | encode_image | encode_text | total |\n",
"|:---------|--------:|---------------:|--------------:|--------:|\n",
"| ViT-B/32 | 2 | 0.02 | 0.035 | 0.055 |\n",
"| ViT-B/32 | 8 | 0.081 | 0.098 | 0.179 |\n",
"| ViT-B/32 | 16 | 0.207 | 0.196 | 0.403 |\n",
"| ViT-B/32 | 32 | 0.44 | 0.374 | 0.814 |\n",
"| ViT-B/32 | 64 | 0.919 | 0.719 | 1.638 |\n"
]
}
]
}
]
}
================================================
FILE: examples/dev/clip_onnx_benchmark_gpu_T4.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "clip-onnx-benchmark-gpu-T4.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyNqeHpYdbkhiqZatysOn5ch",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "fxPg_VvZuScV"
}
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "al_QNjyFq6Jj"
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git@dev\n",
"!pip install git+https://github.com/openai/CLIP.git\n",
"!pip install onnxruntime-gpu"
]
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "42eeJz9lTdJ6"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XuauIZIBSEUX",
"outputId": "3e459c2c-8f31-4aff-c288-f2e6c4684e36"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Tue May 3 07:10:09 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 38C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gqvxpdajRX5_",
"outputId": "48a89abb-a326-4563-f99a-40c7d25145af"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## GPU inference mode\n",
"Select a runtime GPU to continue:\n",
"\n",
"Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
],
"metadata": {
"id": "010k-ksVTjAu"
}
},
{
"cell_type": "markdown",
"source": [
"### Torch CLIP"
],
"metadata": {
"id": "KdTz0IJWVBqE"
}
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# onnx cannot work with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
"\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0) # [1, 3, 224, 224]\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]) # [3, 77]\n",
"text_onnx = text.detach().cpu().numpy().astype(np.int32)"
],
"metadata": {
"id": "9ROPwKYurOhP"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"### CLIP-ONNX"
],
"metadata": {
"id": "Ao2MriaVVG6Y"
}
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx\n",
"\n",
"onnx_model = clip_onnx(model)\n",
"onnx_model.convert2onnx(image, text, verbose=True)\n",
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "nSeG9uAZrcph",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "1186f909-6cfb-400b-c2d9-3dddc93d318b"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n",
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/torch/onnx/symbolic_opset9.py:2909: UserWarning: Exporting aten::index operator of advanced indexing in opset 12 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n",
" \"If indices include negative values, the exported graph will produce incorrect results.\")\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"onnx_model = clip_onnx(model)\n",
"onnx_model.load_onnx(\"/content/clip_visual.onnx\",\n",
" \"/content/clip_textual.onnx\",\n",
" model.logit_scale.exp())\n",
"onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # GPU mode"
],
"metadata": {
"id": "PsDS7ty79zZf"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model.visual_session.get_providers()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aZsGJNrbNCYe",
"outputId": "05464d1a-7047-4efd-80fe-32870cf34afd"
},
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['CUDAExecutionProvider', 'CPUExecutionProvider']"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"source": [
"## Benchmark"
],
"metadata": {
"id": "J5IcOG_6jAFz"
}
},
{
"cell_type": "code",
"source": [
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cuda\", jit=False)"
],
"metadata": {
"id": "SJ_5_x7vLepK"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model.eval()\n",
"for x in model.parameters():\n",
" x.requires_grad = False"
],
"metadata": {
"id": "OnOzZ3LMuubW"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy, random, torch"
],
"metadata": {
"id": "wDwqRRrTGKUS"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def set_seed():\n",
" torch.manual_seed(12)\n",
" torch.cuda.manual_seed(12)\n",
" np.random.seed(12)\n",
" random.seed(12)\n",
"\n",
" torch.backends.cudnn.deterministic=True"
],
"metadata": {
"id": "9H17n_6gGJgT"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import time\n",
"\n",
"n = 5\n",
"clip_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"onnx_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"for batch in [2, 8, 16, 32, 64]:\n",
" set_seed()\n",
" t_mean = []\n",
" for _ in range(n):\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224))\n",
" image_input_onnx = image_input.detach().cpu().numpy().astype(np.float32)\n",
" t = time.time()\n",
" onnx_model.encode_image(image_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" image_input = torch.randint(1, 255, (batch, 3, 224, 224)).cuda()\n",
" t = time.time()\n",
" model.encode_image(image_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_image\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_image\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" t_mean = []\n",
" for _ in range(n):\n",
" text_input = torch.randint(320, 49407, (batch, 77))\n",
" text_input_onnx = text_input.detach().cpu().numpy().astype(np.int32)\n",
" t = time.time()\n",
" onnx_model.encode_text(text_input_onnx)\n",
" t_mean.append(time.time() - t)\n",
" print(\"onnx\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" onnx_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" t_mean = []\n",
" for _ in range(n):\n",
" text_input = torch.randint(320, 49407, (batch, 77)).cuda()\n",
" t = time.time()\n",
" model.encode_text(text_input)\n",
" t_mean.append(time.time() - t)\n",
" print(\"torch\", batch, \"encode_text\", round(sum(t_mean) / n, 3))\n",
" torch.cuda.empty_cache()\n",
" clip_results[\"encode_text\"].append([batch, round(sum(t_mean) / n, 3)])\n",
"\n",
" print(\"-\" * 78)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4lFL6tzWjiWL",
"outputId": "c2b9f0e4-9b93-408b-96bf-3fdb3057e15b"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"onnx 2 encode_image 0.155\n",
"torch 2 encode_image 0.017\n",
"onnx 2 encode_text 0.01\n",
"torch 2 encode_text 0.009\n",
"------------------------------------------------------------------------------\n",
"onnx 8 encode_image 0.032\n",
"torch 8 encode_image 0.008\n",
"onnx 8 encode_text 0.014\n",
"torch 8 encode_text 0.008\n",
"------------------------------------------------------------------------------\n",
"onnx 16 encode_image 0.037\n",
"torch 16 encode_image 0.009\n",
"onnx 16 encode_text 0.029\n",
"torch 16 encode_text 0.012\n",
"------------------------------------------------------------------------------\n",
"onnx 32 encode_image 0.076\n",
"torch 32 encode_image 0.008\n",
"onnx 32 encode_text 0.059\n",
"torch 32 encode_text 0.025\n",
"------------------------------------------------------------------------------\n",
"onnx 64 encode_image 0.169\n",
"torch 64 encode_image 0.009\n",
"onnx 64 encode_text 0.117\n",
"torch 64 encode_text 0.049\n",
"------------------------------------------------------------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd"
],
"metadata": {
"id": "P2YhbE9v_4ci"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n",
" \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n",
" \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n",
" clip_results[\"encode_image\"]) for j in i],\n",
" \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n",
" clip_results[\"encode_text\"]) for j in i]})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 362
},
"id": "WfZfDk4PAlqm",
"outputId": "3375eac7-47b0-40ba-c2d6-c30fda2ab6d5"
},
"execution_count": 13,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" backend batch encode_image encode_text\n",
"0 onnx 2 0.155 0.010\n",
"1 torch 2 0.017 0.009\n",
"2 onnx 8 0.032 0.014\n",
"3 torch 8 0.008 0.008\n",
"4 onnx 16 0.037 0.029\n",
"5 torch 16 0.009 0.012\n",
"6 onnx 32 0.076 0.059\n",
"7 torch 32 0.008 0.025\n",
"8 onnx 64 0.169 0.117\n",
"9 torch 64 0.009 0.049"
]
},
"metadata": {},
"execution_count": 13
}
]
},
{
"cell_type": "code",
"source": [
"onnx_df = pd.DataFrame({\"ONNX\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n",
"onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]"
],
"metadata": {
"id": "Xpw9lV7yBbA8"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_df"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 206
},
"id": "LItAyQkeDhnQ",
"outputId": "e6c88747-5eba-4c16-be40-d4de584f429e"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" ONNX batch encode_image encode_text total\n",
"0 ViT-B/32 2 0.155 0.010 0.165\n",
"1 ViT-B/32 8 0.032 0.014 0.046\n",
"2 ViT-B/32 16 0.037 0.029 0.066\n",
"3 ViT-B/32 32 0.076 0.059 0.135\n",
"4 ViT-B/32 64 0.169 0.117 0.286"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"print(onnx_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AIQDA9FaJZ7Y",
"outputId": "8b197c3c-63d1-42c4-8ca3-a3258acfc878"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| ONNX | batch | encode_image | encode_text | total |\n",
"|:---------|--------:|---------------:|--------------:|--------:|\n",
"| ViT-B/32 | 2 | 0.155 | 0.01 | 0.165 |\n",
"| ViT-B/32 | 8 | 0.032 | 0.014 | 0.046 |\n",
"| ViT-B/32 | 16 | 0.037 | 0.029 | 0.066 |\n",
"| ViT-B/32 | 32 | 0.076 | 0.059 | 0.135 |\n",
"| ViT-B/32 | 64 | 0.169 | 0.117 | 0.286 |\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"clip_df = pd.DataFrame({\"TORCH\": [\"ViT-B/32\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n",
"clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]"
],
"metadata": {
"id": "E1OXQUDvDZmI"
},
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(clip_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xAj-ynhCDpPO",
"outputId": "f90bc132-4727-45df-a6c2-49e2a68e0a4a"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| TORCH | batch | encode_image | encode_text | total |\n",
"|:---------|--------:|---------------:|--------------:|--------:|\n",
"| ViT-B/32 | 2 | 0.017 | 0.009 | 0.026 |\n",
"| ViT-B/32 | 8 | 0.008 | 0.008 | 0.016 |\n",
"| ViT-B/32 | 16 | 0.009 | 0.012 | 0.021 |\n",
"| ViT-B/32 | 32 | 0.008 | 0.025 | 0.033 |\n",
"| ViT-B/32 | 64 | 0.009 | 0.049 | 0.058 |\n"
]
}
]
}
]
}
================================================
FILE: examples/readme_example.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "readme_example.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyPpME0Qdi/m3VZQ+jNj39dT",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload the session if something doesn't work"
],
"metadata": {
"id": "whlsBiJgR8le"
}
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n",
"!pip install git+https://github.com/openai/CLIP.git\n",
"!pip install onnxruntime-gpu"
],
"metadata": {
"id": "HnbpAkvuR73L"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "tqy0zKM4R-7M"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!nvidia-smi # CPU Provider"
],
"metadata": {
"id": "eKqETHL4YscZ",
"outputId": "7ff0bc18-fb40-4296-ab05-b079043e46a1",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"\n",
"print(onnxruntime.get_device()) # priority device"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "x8IN72OnSAIh",
"outputId": "81d14047-91fa-4a5c-a1e3-f5b550556591"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## CPU inference mode"
],
"metadata": {
"id": "U1Pr-YTtSEhs"
}
},
{
"cell_type": "code",
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning)"
],
"metadata": {
"id": "gZTxanR26knr"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import clip\n",
"from PIL import Image\n",
"import numpy as np\n",
"\n",
"# onnx cannot export with cuda\n",
"model, preprocess = clip.load(\"ViT-B/32\", device=\"cpu\", jit=False)\n",
"\n",
"# batch first\n",
"image = preprocess(Image.open(\"CLIP.png\")).unsqueeze(0).cpu() # [1, 3, 224, 224]\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"\n",
"# batch first\n",
"text = clip.tokenize([\"a diagram\", \"a dog\", \"a cat\"]).cpu() # [3, 77]\n",
"text_onnx = text.detach().cpu().numpy().astype(np.int32)"
],
"metadata": {
"id": "rPwc6A2SSGyl"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx, attention\n",
"# clip.model.ResidualAttentionBlock.attention = attention\n",
"\n",
"visual_path = \"clip_visual.onnx\"\n",
"textual_path = \"clip_textual.onnx\"\n",
"\n",
"onnx_model = clip_onnx(model, visual_path=visual_path, textual_path=textual_path)\n",
"onnx_model.convert2onnx(image, text, verbose=True)\n",
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CPUExecutionProvider\"]) # cpu mode"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oYM5FDSGSJBW",
"outputId": "816705b1-3829-4424-c7c4-5426cf21cc18"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n",
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n",
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"image_features = onnx_model.encode_image(image_onnx)\n",
"text_features = onnx_model.encode_text(text_onnx)\n",
"\n",
"logits_per_image, logits_per_text = onnx_model(image_onnx, text_onnx)\n",
"probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()\n",
"\n",
"print(\"Label probs:\", probs) # prints: [[0.9927937 0.00421067 0.00299571]]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tYVuk72nSLw6",
"outputId": "41608059-3732-4ea7-c619-66f803af4185"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Label probs: [[0.9927937 0.00421067 0.00299571]]\n"
]
}
]
}
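,
{
"cell_type": "markdown",
"source": [
"A minimal sketch of what the forward call above computes, assuming `image_features`, `text_features` and `model` from the previous cells: normalize the features and scale the cosine similarities by `model.logit_scale.exp()` to reproduce the logits by hand."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: reproduce the logits by hand from the encoded features.\n",
"# Assumes image_features / text_features are NumPy arrays returned by the ONNX sessions\n",
"# and that `model` from the export cell is still in memory (for logit_scale).\n",
"import numpy as np\n",
"\n",
"img = np.asarray(image_features, dtype=np.float32)\n",
"txt = np.asarray(text_features, dtype=np.float32)\n",
"img = img / np.linalg.norm(img, axis=-1, keepdims=True)\n",
"txt = txt / np.linalg.norm(txt, axis=-1, keepdims=True)\n",
"\n",
"logit_scale = float(model.logit_scale.exp())\n",
"logits = logit_scale * img @ txt.T\n",
"logits = logits - logits.max(axis=-1, keepdims=True)  # stabilize the softmax\n",
"manual_probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)\n",
"print(\"Manual label probs:\", manual_probs)  # should roughly match the probs printed above"
],
"metadata": {},
"execution_count": null,
"outputs": []
}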
]
}
================================================
FILE: examples/ru_CLIP_tiny_onnx.ipynb
================================================
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "ru_CLIP_tiny_onnx.ipynb",
"provenance": [],
"collapsed_sections": [
"WWXCt_2NLhN_",
"PHb4CAoRL3qC",
"re2sSYAYO3D-",
"ithu4-z0PIm5",
"FWm0GAhWPzSW"
],
"machine_shape": "hm"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"5319c7971f234d4bb615508f76475f9e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_c43027a0735e459ca1f710e5a9c43177",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_c00c959249db4f2a9b97adabd2684c3c",
"IPY_MODEL_d9e4edd05c1e40f991eb2c2f1fc9ebc1",
"IPY_MODEL_ab4928c0a86449a384e36d8c0bc25717"
]
}
},
"c43027a0735e459ca1f710e5a9c43177": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c00c959249db4f2a9b97adabd2684c3c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_3251223dac8f43c081701ff7f663cb35",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_d63d5559ce534b86969132d3ff8d875b"
}
},
"d9e4edd05c1e40f991eb2c2f1fc9ebc1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_59618f021fc4495e9c401a421d28d4a0",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 381781,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 381781,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c56ba935682647dca4bdcc593fe0d2cc"
}
},
"ab4928c0a86449a384e36d8c0bc25717": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_01808c7fec8447368d60a33b2d683851",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 373k/373k [00:00<00:00, 876kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_f9466e0349c84633a0fb8ceeffa2a984"
}
},
"3251223dac8f43c081701ff7f663cb35": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"d63d5559ce534b86969132d3ff8d875b": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"59618f021fc4495e9c401a421d28d4a0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"c56ba935682647dca4bdcc593fe0d2cc": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"01808c7fec8447368d60a33b2d683851": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"f9466e0349c84633a0fb8ceeffa2a984": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"10ee9777b41e42129e2c9cc9327ad88f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_cb6a647757244da3941602127ec38ccb",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_a64a223312144f2f9736729b63ab1ce5",
"IPY_MODEL_7e7bce13eeed41179e4e15fc7afc89d5",
"IPY_MODEL_7bacd13c23cf415fa5d58e9243c4a785"
]
}
},
"cb6a647757244da3941602127ec38ccb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"a64a223312144f2f9736729b63ab1ce5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_9fe6e1167e5d45fbad2adab3d59e017d",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_655c507d8fcf423f8bd6746201f569ae"
}
},
"7e7bce13eeed41179e4e15fc7afc89d5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_8c4812afaaec4d65bf84a1e77840d356",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 112,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 112,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_64dfa71e3dff4236908e0592e4f90250"
}
},
"7bacd13c23cf415fa5d58e9243c4a785": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_ec8d98c1edb148d3ae1c518b61e8155b",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 112/112 [00:00<00:00, 3.41kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_a8212828565d4c9884d44fb45dc51ee5"
}
},
"9fe6e1167e5d45fbad2adab3d59e017d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"655c507d8fcf423f8bd6746201f569ae": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8c4812afaaec4d65bf84a1e77840d356": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"64dfa71e3dff4236908e0592e4f90250": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ec8d98c1edb148d3ae1c518b61e8155b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"a8212828565d4c9884d44fb45dc51ee5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9a2d4d7da3024cc0828b1a6dafd0dd16": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_fe4daa4d7d024187aa2f622dbf3577a8",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_8380bf9a899645e8aef576e640b41ea2",
"IPY_MODEL_37c593d2f442497483cd0026498bab05",
"IPY_MODEL_3cc7b132c94f427ba44858e4c4ce3019"
]
}
},
"fe4daa4d7d024187aa2f622dbf3577a8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8380bf9a899645e8aef576e640b41ea2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_d2aed3c0f95b4677bd6e949a4ed0403e",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_bbc52e0e0b2f4758bd7d6cf44b4670ae"
}
},
"37c593d2f442497483cd0026498bab05": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_8513d262d8764d99aa5d3f2f178b875e",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 239,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 239,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_57984bbd46a84a7bb2b7629e6b2f9ef9"
}
},
"3cc7b132c94f427ba44858e4c4ce3019": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b886e2b6bbcd46cf806ff3a0b3cb8d33",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 239/239 [00:00<00:00, 5.49kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_21ac91113f7e4548b416a32b1b3f66a9"
}
},
"d2aed3c0f95b4677bd6e949a4ed0403e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"bbc52e0e0b2f4758bd7d6cf44b4670ae": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8513d262d8764d99aa5d3f2f178b875e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"57984bbd46a84a7bb2b7629e6b2f9ef9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b886e2b6bbcd46cf806ff3a0b3cb8d33": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"21ac91113f7e4548b416a32b1b3f66a9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"f8958c6de2394fecab9f95388a365431": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_11a8a4b2d39d4ea8904c0f1b2f6dd906",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_9657501af7514a60b30fcd60a223980c",
"IPY_MODEL_61274b2bac5e4835a8bd33dc201bc155",
"IPY_MODEL_973300b095554b10ac290244772e0a6f"
]
}
},
"11a8a4b2d39d4ea8904c0f1b2f6dd906": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9657501af7514a60b30fcd60a223980c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_c3f5f56bb14d44b6a5775a77f6763b94",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": "Downloading: 100%",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_fbf430940c8a49949953155b57d07766"
}
},
"61274b2bac5e4835a8bd33dc201bc155": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_ab05641bcb9c49aab977110fab503a78",
"_dom_classes": [],
"description": "",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 175,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 175,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_4f11a71d7df943e48ac9ea3bab5c6771"
}
},
"973300b095554b10ac290244772e0a6f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_b6004e09152045e18503cf75e32d4fa6",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 175/175 [00:00<00:00, 5.41kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_590fb707d26948b5b9c8bb3b896f29e1"
}
},
"c3f5f56bb14d44b6a5775a77f6763b94": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"fbf430940c8a49949953155b57d07766": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"ab05641bcb9c49aab977110fab503a78": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"4f11a71d7df943e48ac9ea3bab5c6771": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b6004e09152045e18503cf75e32d4fa6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"590fb707d26948b5b9c8bb3b896f29e1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# [](https://colab.research.google.com/github/cene555/ru-clip-tiny/blob/main/notebooks/ru_CLIP_tiny_onnx.ipynb)"
],
"metadata": {
"id": "JsWuTduwaagq"
}
},
{
"cell_type": "markdown",
"source": [
"## Select a runtime GPU to continue:\n",
"\n",
"Click Runtime -> Change Runtime Type -> switch \"Harware accelerator\" to be GPU. Save it, and you maybe connect to GPU"
],
"metadata": {
"id": "VCCzmQdKJPkv"
}
},
{
"cell_type": "code",
"source": [
"#@title Allowed Resources\n",
"import multiprocessing\n",
"import torch\n",
"from psutil import virtual_memory\n",
"\n",
"ram_gb = round(virtual_memory().total / 1024**3, 1)\n",
"\n",
"print('CPU:', multiprocessing.cpu_count())\n",
"print('RAM GB:', ram_gb)\n",
"print(\"PyTorch version:\", torch.__version__)\n",
"print(\"CUDA version:\", torch.version.cuda)\n",
"print(\"cuDNN version:\", torch.backends.cudnn.version())\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"print(\"device:\", device.type)\n",
"\n",
"!nvidia-smi"
],
"metadata": {
"cellView": "form",
"id": "6xdy_cPJEYXV",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9b5c5751-3377-4623-fd90-f59c21118c80"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU: 2\n",
"RAM GB: 12.7\n",
"PyTorch version: 1.10.0+cu111\n",
"CUDA version: 11.1\n",
"cuDNN version: 8005\n",
"device: cuda\n",
"Tue Feb 1 17:26:24 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 495.46 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 61C P8 11W / 70W | 3MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Restart colab session after installation\n",
"Reload session if something doesn't work (may need multiple times)"
],
"metadata": {
"id": "hmNP7iJBj6XZ"
}
},
{
"cell_type": "markdown",
"source": [
"## Install requirements"
],
"metadata": {
"id": "WWXCt_2NLhN_"
}
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!gdown -O ru-clip-tiny.pkl https://drive.google.com/uc?id=1-3g3J90pZmHo9jbBzsEmr7ei5zm3VXOL\n",
"\n",
"!pip install git+https://github.com/cene555/ru-clip-tiny.git\n",
"!pip install git+https://github.com/Lednik7/CLIP-ONNX.git\n",
"!pip install onnxruntime-gpu\n",
"\n",
"!wget -c -O CLIP.png https://github.com/openai/CLIP/blob/main/CLIP.png?raw=true"
],
"metadata": {
"id": "FWEEtd7Vryaf"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import onnxruntime\n",
"\n",
"# priority device (if available)\n",
"print(onnxruntime.get_device())"
],
"metadata": {
"id": "bUFx02Dhjap4",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "f595c387-da47-47e5-f96a-2d84adf3286b"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPU\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## Import libraries"
],
"metadata": {
"id": "PHb4CAoRL3qC"
}
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from rucliptiny import RuCLIPtiny\n",
"from rucliptiny.utils import get_transform\n",
"from rucliptiny.tokenizer import Tokenizer"
],
"metadata": {
"id": "cznZ7ozDL5-M"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning)"
],
"metadata": {
"id": "57COx0BKCmFA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Load model"
],
"metadata": {
"id": "ithu4-z0PIm5"
}
},
{
"cell_type": "code",
"source": [
"#@title speed_test function\n",
"\n",
"import time\n",
"\n",
"def speed_test(func, data_gen, n=5, empty_cache=True, is_text=False,\n",
" first_run=True):\n",
" if empty_cache: torch.cuda.empty_cache()\n",
" if first_run:\n",
" if is_text:\n",
" input_data1, input_data2 = data_gen()\n",
" func(input_data1, input_data2)\n",
" else:\n",
" input_data = data_gen()\n",
" func(input_data)\n",
" torch.cuda.empty_cache()\n",
" \n",
" values = []\n",
" for _ in range(n):\n",
" if is_text:\n",
" input_data1, input_data2 = data_gen()\n",
" else:\n",
" input_data = data_gen()\n",
" if is_text:\n",
" t = time.time()\n",
" func(input_data1, input_data2)\n",
" else:\n",
" t = time.time()\n",
" func(input_data)\n",
" values.append(time.time() - t)\n",
" if empty_cache: torch.cuda.empty_cache()\n",
" return sum(values) / n"
],
"metadata": {
"id": "GqKM04tP4Vv3",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"torch.manual_seed(1)\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
],
"metadata": {
"id": "SSOHYDRQGif-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = RuCLIPtiny()\n",
"model.load_state_dict(torch.load('ru-clip-tiny.pkl',\n",
" map_location=device))\n",
"model = model.to(device).eval()\n",
"for x in model.parameters(): x.requires_grad = False\n",
"torch.cuda.empty_cache()"
],
"metadata": {
"id": "OpFAZfq-_nJe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"transforms = get_transform()\n",
"tokenizer = Tokenizer()"
],
"metadata": {
"id": "KEZj2WrwkzZz",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 145,
"referenced_widgets": [
"5319c7971f234d4bb615508f76475f9e",
"c43027a0735e459ca1f710e5a9c43177",
"c00c959249db4f2a9b97adabd2684c3c",
"d9e4edd05c1e40f991eb2c2f1fc9ebc1",
"ab4928c0a86449a384e36d8c0bc25717",
"3251223dac8f43c081701ff7f663cb35",
"d63d5559ce534b86969132d3ff8d875b",
"59618f021fc4495e9c401a421d28d4a0",
"c56ba935682647dca4bdcc593fe0d2cc",
"01808c7fec8447368d60a33b2d683851",
"f9466e0349c84633a0fb8ceeffa2a984",
"10ee9777b41e42129e2c9cc9327ad88f",
"cb6a647757244da3941602127ec38ccb",
"a64a223312144f2f9736729b63ab1ce5",
"7e7bce13eeed41179e4e15fc7afc89d5",
"7bacd13c23cf415fa5d58e9243c4a785",
"9fe6e1167e5d45fbad2adab3d59e017d",
"655c507d8fcf423f8bd6746201f569ae",
"8c4812afaaec4d65bf84a1e77840d356",
"64dfa71e3dff4236908e0592e4f90250",
"ec8d98c1edb148d3ae1c518b61e8155b",
"a8212828565d4c9884d44fb45dc51ee5",
"9a2d4d7da3024cc0828b1a6dafd0dd16",
"fe4daa4d7d024187aa2f622dbf3577a8",
"8380bf9a899645e8aef576e640b41ea2",
"37c593d2f442497483cd0026498bab05",
"3cc7b132c94f427ba44858e4c4ce3019",
"d2aed3c0f95b4677bd6e949a4ed0403e",
"bbc52e0e0b2f4758bd7d6cf44b4670ae",
"8513d262d8764d99aa5d3f2f178b875e",
"57984bbd46a84a7bb2b7629e6b2f9ef9",
"b886e2b6bbcd46cf806ff3a0b3cb8d33",
"21ac91113f7e4548b416a32b1b3f66a9",
"f8958c6de2394fecab9f95388a365431",
"11a8a4b2d39d4ea8904c0f1b2f6dd906",
"9657501af7514a60b30fcd60a223980c",
"61274b2bac5e4835a8bd33dc201bc155",
"973300b095554b10ac290244772e0a6f",
"c3f5f56bb14d44b6a5775a77f6763b94",
"fbf430940c8a49949953155b57d07766",
"ab05641bcb9c49aab977110fab503a78",
"4f11a71d7df943e48ac9ea3bab5c6771",
"b6004e09152045e18503cf75e32d4fa6",
"590fb707d26948b5b9c8bb3b896f29e1"
]
},
"outputId": "466854ba-7fa2-4154-ada2-391626146c95"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5319c7971f234d4bb615508f76475f9e",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/373k [00:00, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "10ee9777b41e42129e2c9cc9327ad88f",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/112 [00:00, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9a2d4d7da3024cc0828b1a6dafd0dd16",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/239 [00:00, ?B/s]"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f8958c6de2394fecab9f95388a365431",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"Downloading: 0%| | 0.00/175 [00:00, ?B/s]"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## [Speed test] Batch 64"
],
"metadata": {
"id": "BGsIitOkCCLE"
}
},
{
"cell_type": "code",
"source": [
"speed_test(model.encode_image, lambda: torch.randint(1, 255, (64, 3, 224, 224)).to(device))"
],
"metadata": {
"id": "5Ii9OlgUjR9J",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "21f1c03c-3e45-4650-d892-be2d83021d21"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.011787748336791993"
]
},
"metadata": {},
"execution_count": 7
}
]
},
{
"cell_type": "code",
"source": [
"speed_test(model.encode_text,\n",
" lambda: (torch.randint(1, 255, (64, 77)).to(device),\n",
" torch.randint(0, 2, (64, 77)).to(device)),\n",
" is_text=True)"
],
"metadata": {
"id": "3Ho_rGd6j0_8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "1026df31-ea4b-4f50-e76d-f300bee0299a"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.004021787643432617"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"source": [
"## Prepare functions"
],
"metadata": {
"id": "81uWLBrMkl3T"
}
},
{
"cell_type": "code",
"source": [
"from PIL import Image\n",
"import numpy as np"
],
"metadata": {
"id": "ry5BqVbzk-gM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# batch first\n",
"image = transforms(Image.open(\"CLIP.png\")).unsqueeze(0).cpu() # [1, 3, 224, 224]\n",
"\n",
"# batch first\n",
"texts = ['диаграмма', 'собака', 'кошка']\n",
"text_tokens, attention_mask = tokenizer.tokenize(texts, max_len=77)\n",
"text_tokens, attention_mask = text_tokens.cpu(), attention_mask.cpu() # [3, 77]\n",
"\n",
"# batch second\n",
"dummy_input_text = torch.stack([text_tokens, attention_mask]).detach().cpu()"
],
"metadata": {
"id": "jyE4C7nIkT_5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"text_tokens_onnx = text_tokens.detach().cpu().numpy().astype(np.int64)\n",
"attention_mask_onnx = attention_mask.detach().cpu().numpy().astype(np.int64)\n",
"\n",
"image_onnx = image.detach().cpu().numpy().astype(np.float32)\n",
"text_onnx = torch.stack([text_tokens, attention_mask]).detach().cpu()\\\n",
" .numpy().astype(np.int64)"
],
"metadata": {
"id": "9SJJmuuWlSjS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## Convert RuCLIP model to ONNX"
],
"metadata": {
"id": "Y7V4BjOGkRcu"
}
},
{
"cell_type": "code",
"source": [
"class Textual(torch.nn.Module):\n",
" def __init__(self, model):\n",
" super().__init__()\n",
" self.model = model\n",
"\n",
" def forward(self, input_data):\n",
" input_ids, attention_mask = input_data\n",
" x = self.model.transformer(input_ids=input_ids, attention_mask=attention_mask)\n",
" x = x.last_hidden_state[:, 0, :]\n",
" x = self.model.final_ln(x)\n",
" return x"
],
"metadata": {
"id": "HzGiuIo8m341"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from clip_onnx import clip_onnx\n",
"from clip_onnx.utils import DEFAULT_EXPORT\n",
"\n",
"visual_path = \"clip_visual.onnx\"\n",
"textual_path = \"clip_textual.onnx\"\n",
"\n",
"textual_export_params = DEFAULT_EXPORT.copy()\n",
"textual_export_params[\"dynamic_axes\"] = {'input': {1: 'batch_size'},\n",
" 'output': {0: 'batch_size'}}\n",
"\n",
"onnx_model = clip_onnx(model.cpu(), visual_path=visual_path, textual_path=textual_path)\n",
"onnx_model.convert2onnx(image, dummy_input_text, verbose=True,\n",
" textual_wrapper=Textual,\n",
" textual_export_params=textual_export_params)"
],
"metadata": {
"id": "k5eQK8gJla5a",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "09ec9b1d-70f0-4d01-87be-5bb622c14e89"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start convert visual model\n",
"[CLIP ONNX] Start check visual model\n",
"[CLIP ONNX] Start convert textual model\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:7: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).\n",
" import sys\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"[CLIP ONNX] Start check textual model\n",
"[CLIP ONNX] Models converts successfully\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## [ONNX] CUDA inference mode"
],
"metadata": {
"id": "QQ0A0gUFzQr-"
}
},
{
"cell_type": "code",
"source": [
"# Optional cell, can be skipped\n",
"\n",
"visual_path = \"clip_visual.onnx\"\n",
"textual_path = \"clip_textual.onnx\"\n",
"\n",
"onnx_model.load_onnx(visual_path,\n",
" textual_path,\n",
" 29.9119) # model.logit_scale.exp()"
],
"metadata": {
"id": "YR-Pv3E8q_mz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']\n",
"onnx_model.start_sessions(providers=[\"CUDAExecutionProvider\"]) # cuda mode"
],
"metadata": {
"id": "J2qxXvmfo2eu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"onnx_model.visual_session.get_providers()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yq05H9f7vyQy",
"outputId": "2c39c48b-db02-4610-addd-901429497a43"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['CUDAExecutionProvider', 'CPUExecutionProvider']"
]
},
"metadata": {},
"execution_count": 16
}
]
},
{
"cell_type": "markdown",
"source": [
"## [Speed test] Batch 64"
],
"metadata": {
"id": "EieJHr_CA2ui"
}
},
{
"cell_type": "code",
"source": [
"speed_test(onnx_model.encode_image,\n",
" lambda: np.random.uniform(1, 255, (64, 3, 224, 224))\\\n",
" .astype(np.float32))"
],
"metadata": {
"id": "kyF8lyTXnwCz",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d675b77b-7979-44e6-f7d9-45013a1b17b8"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.28517956733703614"
]
},
"metadata": {},
"execution_count": 17
}
]
},
{
"cell_type": "code",
"source": [
"speed_test(onnx_model.encode_text,\n",
" lambda: np.stack([np.random.randint(1, 255, (64, 77)),\n",
" np.random.randint(0, 2, (64, 77))]))"
],
"metadata": {
"id": "AmShwsCtoYte",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9cd94020-d813-4ddd-cd74-cb6d7d922930"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0.012344837188720703"
]
},
"metadata": {},
"execution_count": 18
}
]
},
{
"cell_type": "markdown",
"source": [
"## [Speed test] Compare Pytorch and ONNX"
],
"metadata": {
"id": "zejMPUDCB2Mi"
}
},
{
"cell_type": "code",
"source": [
"import random\n",
"import torch\n",
"import time\n",
"\n",
"def set_seed():\n",
" torch.manual_seed(12)\n",
" torch.cuda.manual_seed(12)\n",
" np.random.seed(12)\n",
" random.seed(12)\n",
"\n",
" torch.backends.cudnn.deterministic=True"
],
"metadata": {
"id": "HqLSjsiGCJXW"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"n = 20\n",
"model = model.to(device)\n",
"\n",
"clip_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
"\n",
"onnx_results = {\"encode_image\": [],\n",
" \"encode_text\": []}\n",
" \n",
"for batch in [2, 8, 16, 32, 64]:\n",
" set_seed()\n",
" result = speed_test(onnx_model.encode_image,\n",
" lambda: np.random.uniform(1, 255, (batch, 3, 224, 224))\\\n",
" .astype(np.float32), n=n)\n",
" result = round(result, 3)\n",
" onnx_results[\"encode_image\"].append([batch, result])\n",
" print(\"onnx\", batch, \"encode_image\", result)\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" result = speed_test(model.encode_image,\n",
" lambda: torch.randint(1, 255, (batch, 3, 224, 224))\\\n",
" .to(device), n=n)\n",
" result = round(result, 3)\n",
" print(\"torch\", batch, \"encode_image\", result)\n",
" clip_results[\"encode_image\"].append([batch, result])\n",
"\n",
" set_seed()\n",
" result = speed_test(onnx_model.encode_text,\n",
" lambda: np.stack([np.random.randint(1, 255, (batch, 77)),\n",
" np.random.randint(0, 2, (batch, 77))]),\n",
" n=n)\n",
" result = round(result, 3)\n",
" onnx_results[\"encode_text\"].append([batch, result])\n",
" print(\"onnx\", batch, \"encode_text\", result)\n",
"\n",
" set_seed()\n",
" with torch.inference_mode():\n",
" result = speed_test(model.encode_text,\n",
" lambda: (torch.randint(1, 255, (batch, 77)).to(device),\n",
" torch.randint(0, 2, (batch, 77)).to(device)),\n",
" is_text=True, n=n)\n",
" result = round(result, 3)\n",
" print(\"torch\", batch, \"encode_text\", result)\n",
" clip_results[\"encode_text\"].append([batch, result])\n",
"\n",
" print(\"-\" * 78)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YILIR6qMB_eb",
"outputId": "95e2c9a0-26bb-4203-f0e5-50589f44ddaf"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"onnx 2 encode_image 0.011\n",
"torch 2 encode_image 0.018\n",
"onnx 2 encode_text 0.001\n",
"torch 2 encode_text 0.003\n",
"------------------------------------------------------------------------------\n",
"onnx 8 encode_image 0.035\n",
"torch 8 encode_image 0.01\n",
"onnx 8 encode_text 0.002\n",
"torch 8 encode_text 0.003\n",
"------------------------------------------------------------------------------\n",
"onnx 16 encode_image 0.07\n",
"torch 16 encode_image 0.01\n",
"onnx 16 encode_text 0.004\n",
"torch 16 encode_text 0.003\n",
"------------------------------------------------------------------------------\n",
"onnx 32 encode_image 0.145\n",
"torch 32 encode_image 0.012\n",
"onnx 32 encode_text 0.007\n",
"torch 32 encode_text 0.004\n",
"------------------------------------------------------------------------------\n",
"onnx 64 encode_image 0.294\n",
"torch 64 encode_image 0.013\n",
"onnx 64 encode_text 0.014\n",
"torch 64 encode_text 0.005\n",
"------------------------------------------------------------------------------\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"pd.DataFrame({\"backend\": [\"onnx\", \"torch\"] * 5,\n",
" \"batch\": [2, 2, 8, 8, 16, 16, 32, 32, 64, 64],\n",
" \"encode_image\": [j[1] for i in zip(onnx_results[\"encode_image\"],\n",
" clip_results[\"encode_image\"]) for j in i],\n",
" \"encode_text\": [j[1] for i in zip(onnx_results[\"encode_text\"],\n",
" clip_results[\"encode_text\"]) for j in i]})"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 362
},
"id": "WAWUKqQOGd-2",
"outputId": "725a771f-1b75-4e3a-afa8-7ee7c9caac1f"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"\n",
" \n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" backend | \n",
" batch | \n",
" encode_image | \n",
" encode_text | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" onnx | \n",
" 2 | \n",
" 0.011 | \n",
" 0.001 | \n",
"
\n",
" \n",
" | 1 | \n",
" torch | \n",
" 2 | \n",
" 0.018 | \n",
" 0.003 | \n",
"
\n",
" \n",
" | 2 | \n",
" onnx | \n",
" 8 | \n",
" 0.035 | \n",
" 0.002 | \n",
"
\n",
" \n",
" | 3 | \n",
" torch | \n",
" 8 | \n",
" 0.010 | \n",
" 0.003 | \n",
"
\n",
" \n",
" | 4 | \n",
" onnx | \n",
" 16 | \n",
" 0.070 | \n",
" 0.004 | \n",
"
\n",
" \n",
" | 5 | \n",
" torch | \n",
" 16 | \n",
" 0.010 | \n",
" 0.003 | \n",
"
\n",
" \n",
" | 6 | \n",
" onnx | \n",
" 32 | \n",
" 0.145 | \n",
" 0.007 | \n",
"
\n",
" \n",
" | 7 | \n",
" torch | \n",
" 32 | \n",
" 0.012 | \n",
" 0.004 | \n",
"
\n",
" \n",
" | 8 | \n",
" onnx | \n",
" 64 | \n",
" 0.294 | \n",
" 0.014 | \n",
"
\n",
" \n",
" | 9 | \n",
" torch | \n",
" 64 | \n",
" 0.013 | \n",
" 0.005 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
],
"text/plain": [
" backend batch encode_image encode_text\n",
"0 onnx 2 0.011 0.001\n",
"1 torch 2 0.018 0.003\n",
"2 onnx 8 0.035 0.002\n",
"3 torch 8 0.010 0.003\n",
"4 onnx 16 0.070 0.004\n",
"5 torch 16 0.010 0.003\n",
"6 onnx 32 0.145 0.007\n",
"7 torch 32 0.012 0.004\n",
"8 onnx 64 0.294 0.014\n",
"9 torch 64 0.013 0.005"
]
},
"metadata": {},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"source": [
"onnx_df = pd.DataFrame({\"ONNX\": [\"RuCLIPtiny\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in onnx_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in onnx_results[\"encode_text\"]]})\n",
"onnx_df[\"total\"] = onnx_df[\"encode_image\"] + onnx_df[\"encode_text\"]\n",
"\n",
"print(onnx_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ol9_RiUoG34e",
"outputId": "82be9e0e-b92e-4e3c-8132-9269eb22a41d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| ONNX | batch | encode_image | encode_text | total |\n",
"|:-----------|--------:|---------------:|--------------:|--------:|\n",
"| RuCLIPtiny | 2 | 0.011 | 0.001 | 0.012 |\n",
"| RuCLIPtiny | 8 | 0.035 | 0.002 | 0.037 |\n",
"| RuCLIPtiny | 16 | 0.07 | 0.004 | 0.074 |\n",
"| RuCLIPtiny | 32 | 0.145 | 0.007 | 0.152 |\n",
"| RuCLIPtiny | 64 | 0.294 | 0.014 | 0.308 |\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"clip_df = pd.DataFrame({\"TORCH\": [\"RuCLIPtiny\"] * 5,\n",
" \"batch\": [2, 8, 16, 32, 64],\n",
" \"encode_image\": [i[1] for i in clip_results[\"encode_image\"]],\n",
" \"encode_text\": [i[1] for i in clip_results[\"encode_text\"]]})\n",
"clip_df[\"total\"] = clip_df[\"encode_image\"] + clip_df[\"encode_text\"]\n",
"print(clip_df.to_markdown(index=False))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qw8ZK9XeG4LY",
"outputId": "326b24f9-9d21-47ed-d62c-d7594e786b96"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"| TORCH | batch | encode_image | encode_text | total |\n",
"|:-----------|--------:|---------------:|--------------:|--------:|\n",
"| RuCLIPtiny | 2 | 0.018 | 0.003 | 0.021 |\n",
"| RuCLIPtiny | 8 | 0.01 | 0.003 | 0.013 |\n",
"| RuCLIPtiny | 16 | 0.01 | 0.003 | 0.013 |\n",
"| RuCLIPtiny | 32 | 0.012 | 0.004 | 0.016 |\n",
"| RuCLIPtiny | 64 | 0.013 | 0.005 | 0.018 |\n"
]
}
]
}
]
}
================================================
FILE: requirements.txt
================================================
torch==1.13.1
onnxruntime>=1.11.1
onnx>=1.11.0
================================================
FILE: setup.py
================================================
import os
import pkg_resources
from setuptools import setup, find_packages
with open("requirements.txt", "r") as f:
install_requires = f.read().split("\n")
setup(
name="clip_onnx",
version="1.2",
py_modules=["clip_onnx, clip"],
description="",
author="Maxim Gerasimov",
packages=find_packages(),
install_requires=install_requires,
include_package_data=True
)