[
  {
    "path": "Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb",
    "content": "{\n  \"nbformat\": 4,\n  \"nbformat_minor\": 0,\n  \"metadata\": {\n    \"colab\": {\n      \"name\": \"Export CLIP to ONNX/tflite/tfjs/tf saved model.ipynb\",\n      \"provenance\": [],\n      \"collapsed_sections\": []\n    },\n    \"kernelspec\": {\n      \"name\": \"python3\",\n      \"display_name\": \"Python 3\"\n    },\n    \"accelerator\": \"GPU\"\n  },\n  \"cells\": [\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"AtWchYtU0Dtv\"\n      },\n      \"source\": [\n        \"# IMPORTANT: Make sure you're using a GPU runtime!\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"h_zSu-EKxlBP\"\n      },\n      \"source\": [\n        \"# Based on this notebook: https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"Ur4qmQUIBxwe\"\n      },\n      \"source\": [\n        \"!git clone https://github.com/openai/CLIP\\n\",\n        \"%cd CLIP\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"Rbzq8REAy73u\"\n      },\n      \"source\": [\n        \"# This is SUPER hacky because I don't know a better way (that's quick). Basically the vision model is ready to export as-is, like this:\\n\",\n        \"#   torch.onnx.export(model.vision, ...)\\n\",\n        \"# but the text model has a couple of pre-processing steps (like converting tokens to embeddings), and I'd like to have all that\\n\",\n        \"# processing contained within the onnx file for the text encoder. 
The `torch.onnx.export` function seems to only be able to\\n\",\n        \"# take a *model* as an input, and not a function (like `model.encode_text`), so I'm hackily renaming `model.encode_text` to\\n\",\n        \"# `model.forward` so that I can then write:\\n\",\n        \"#   torch.onnx.export(model, ...)\\n\",\n        \"# to export the text encoder. I'm sure there's a much better way to do this. If this stops working, note that\\n\",\n        \"# it was working at the following commit hash, so you can clone this to get it working: https://github.com/openai/CLIP/tree/573315e83f07b53a61ff5098757e8fc885f1703e\\n\",\n        \"!sed -i -e 's/def forward(self, image, text):/def old_forward(self, image, text):/g' ./clip/model.py\\n\",\n        \"!sed -i -e 's/def encode_text(self, text):/def forward(self, text):/g' ./clip/model.py\"\n      ],\n      \"execution_count\": 2,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"0BpdJkdBssk9\"\n      },\n      \"source\": [\n        \"! 
pip install ftfy regex tqdm\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"uLFS29hnhlY4\"\n      },\n      \"source\": [\n        \"import numpy as np\\n\",\n        \"import torch\\n\",\n        \"import clip\\n\",\n        \"\\n\",\n        \"clip.available_models()\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"IBRVTY9lbGm8\"\n      },\n      \"source\": [\n        \"model, preprocess = clip.load(\\\"ViT-B/32\\\")\\n\",\n        \"model.cuda().eval()\\n\",\n        \"input_resolution = model.visual.input_resolution\\n\",\n        \"context_length = model.context_length\\n\",\n        \"vocab_size = model.vocab_size\\n\",\n        \"\\n\",\n        \"print(\\\"Model parameters:\\\", f\\\"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}\\\")\\n\",\n        \"print(\\\"Input resolution:\\\", input_resolution)\\n\",\n        \"print(\\\"Context length:\\\", context_length)\\n\",\n        \"print(\\\"Vocab size:\\\", vocab_size)\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"qGom156-i2kL\"\n      },\n      \"source\": [\n        \"clip.tokenize(\\\"Hello World!\\\")\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"tMc1AXzBlhzm\"\n      },\n      \"source\": [\n        \"import os\\n\",\n        \"import skimage\\n\",\n        \"from PIL import Image\\n\",\n        \"import numpy as np\\n\",\n        \"\\n\",\n        \"from collections import OrderedDict\\n\",\n        \"import torch\\n\",\n        \"\\n\",\n        \"# images in skimage to use and their textual descriptions\\n\",\n        \"descriptions = {\\n\",\n    
    \"    \\\"astronaut\\\": \\\"a portrait of an astronaut with the American flag\\\",\\n\",\n        \"}\"\n      ],\n      \"execution_count\": 7,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"NSSrLY185jSf\"\n      },\n      \"source\": [\n        \"original_images = []\\n\",\n        \"images = []\\n\",\n        \"texts = []\\n\",\n        \"\\n\",\n        \"for filename in [filename for filename in os.listdir(skimage.data_dir) if filename.endswith(\\\".png\\\") or filename.endswith(\\\".jpg\\\")]:\\n\",\n        \"    name = os.path.splitext(filename)[0]\\n\",\n        \"    if name not in descriptions:\\n\",\n        \"        continue\\n\",\n        \"\\n\",\n        \"    image = Image.open(os.path.join(skimage.data_dir, filename)).convert(\\\"RGB\\\")\\n\",\n        \"    original_images.append(image)\\n\",\n        \"    images.append(preprocess(image))\\n\",\n        \"    texts.append(descriptions[name])\"\n      ],\n      \"execution_count\": 8,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"HBgCanxi8JKw\"\n      },\n      \"source\": [\n        \"image_input = torch.tensor(np.stack(images)).half().cuda()\\n\",\n        \"text_tokens = clip.tokenize([\\\"This is \\\" + desc for desc in texts]).cuda()\"\n      ],\n      \"execution_count\": 9,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"source\": [\n        \"model.visual(image_input)[0] # astronaut pic embedding\"\n      ],\n      \"metadata\": {\n        \"id\": \"g0o8mDN6wq_L\"\n      },\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"source\": [\n        \"model(text_tokens)[0] # astronaut text embedding\"\n      ],\n      \"metadata\": {\n        \"id\": \"qEPHMWwN0Puv\"\n      },\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      
\"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"kDmmi0vMI9WY\"\n      },\n      \"source\": [\n        \"torch.onnx.export(model, text_tokens, \\\"clip-text-vit-32.onnx\\\", export_params=True, opset_version=12, do_constant_folding=True, input_names = ['input'], output_names = ['output'], dynamic_axes={'input' : {0 : 'batch_size'}, 'output' : {0 : 'batch_size'}})\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"BLSGVjueonP0\"\n      },\n      \"source\": [\n        \"torch.onnx.export(model.visual, image_input, \\\"clip-image-vit-32.onnx\\\", export_params=True, opset_version=12, do_constant_folding=True, input_names = ['input'], output_names = ['output'], dynamic_axes={'input' : {0 : 'batch_size'}, 'output' : {0 : 'batch_size'}})\"\n      ],\n      \"execution_count\": 12,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"X0I6iPCOxB9M\"\n      },\n      \"source\": [\n        \"# use this option in the above torch.onnx.export calls if you get a \\\"Unable to cast from non-held to held instance (T& to Holder<T>)\\\" error:\\n\",\n        \"#   operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"OhCoVnSo2XFr\"\n      },\n      \"source\": [\n        \"# The onnx model files are now in the /content/CLIP directory.\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"source\": [\n        \"# Attempt at quantizing model to uint8 (doesn't seem to work? 
no errors, but onnx file is same size)\\n\",\n        \"# Reference: https://github.com/minimaxir/imgbeddings/blob/36fb4d7ac6b82694d109cef6f887d4cb9c49da0f/imgbeddings/models.py#L94\\n\",\n        \"# Here's the model the above code generates: https://huggingface.co/minimaxir/imgbeddings/blob/main/patch32_v1.onnx\\n\",\n        \"# Here's a demo of the above ONNX model with ORT Web: https://jsbin.com/nupehazaju/edit?html,output  <-- seems to work, but this model doesn't have the projection head that squashes 768 vec to 512 elements (so can be compared to text embeddings of same length)\\n\",\n        \"!pip install onnxruntime\\n\",\n        \"!pip install onnx\\n\",\n        \"from onnxruntime.quantization import quantize_dynamic, QuantType\\n\",\n        \"quantize_dynamic(\\\"clip-image-vit-32.onnx\\\", \\\"clip-image-vit-32-uint8.onnx\\\", weight_type=QuantType.QUInt8, extra_options={\\\"MatMulConstBOnly\\\":False}) # I added the MatMulConstBOnly as a guess due to warnings that it outputs without it\"\n      ],\n      \"metadata\": {\n        \"id\": \"24LcAdP2doTx\"\n      },\n      \"execution_count\": 22,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"44jzFoZzxPrf\"\n      },\n      \"source\": [\n        \"# The code below is for converting to tflite, tfjs and tf saved model:\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"r2VoXSsyyFu-\"\n      },\n      \"source\": [\n        \"!pip install git+https://github.com/onnx/onnx-tensorflow.git\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"n0axzSah0_h4\"\n      },\n      \"source\": [\n        \"!onnx-tf convert -i clip-image-vit-32.onnx -o clip-image-vit-32-tf\\n\",\n        \"!onnx-tf convert -i clip-text-vit-32.onnx -o 
clip-text-vit-32-tf\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"2kDc0sPILbQu\"\n      },\n      \"source\": [\n        \"!pip install tensorflowjs\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"WXFWVZACLUR8\"\n      },\n      \"source\": [\n        \"!tensorflowjs_converter --input_format tf_saved_model ./clip-image-vit-32-tf ./clip-image-vit-32-tfjs\\n\",\n        \"!tensorflowjs_converter --input_format tf_saved_model ./clip-text-vit-32-tf ./clip-text-vit-32-tfjs\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"g1Ub_dsaKqO8\"\n      },\n      \"source\": [\n        \"import tensorflow as tf\\n\",\n        \"\\n\",\n        \"# image encoder:\\n\",\n        \"converter = tf.lite.TFLiteConverter.from_saved_model(\\\"./clip-image-vit-32-tf\\\")\\n\",\n        \"converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] # This line is needed because: https://github.com/tensorflow/tfjs/issues/5844\\n\",\n        \"tflite_model = converter.convert()\\n\",\n        \"with open('clip-image-vit-32.tflite', 'wb') as f:\\n\",\n        \"  f.write(tflite_model)\\n\",\n        \"\\n\",\n        \"# text encoder:\\n\",\n        \"converter = tf.lite.TFLiteConverter.from_saved_model(\\\"./clip-text-vit-32-tf\\\")\\n\",\n        \"converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS] # This line is needed because: https://github.com/tensorflow/tfjs/issues/5844\\n\",\n        \"tflite_model = converter.convert()\\n\",\n        \"with open('clip-text-vit-32.tflite', 'wb') as f:\\n\",\n        \"  f.write(tflite_model)\"\n      ],\n      \"execution_count\": null,\n  
    \"outputs\": []\n    }\n  ]\n}\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2021 josephrocca\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "ONNX_float16_to_float32.ipynb",
    "content": "{\n  \"nbformat\": 4,\n  \"nbformat_minor\": 0,\n  \"metadata\": {\n    \"colab\": {\n      \"name\": \"ONNX float16 to float32.ipynb\",\n      \"provenance\": [],\n      \"collapsed_sections\": []\n    },\n    \"kernelspec\": {\n      \"name\": \"python3\",\n      \"display_name\": \"Python 3\"\n    },\n    \"language_info\": {\n      \"name\": \"python\"\n    }\n  },\n  \"cells\": [\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"zDyM0tPjW0KD\"\n      },\n      \"source\": [\n        \"!git clone https://github.com/josephrocca/onnx-typecast # based on: https://github.com/aadhithya/onnx-typecast\\n\",\n        \"%cd onnx-typecast\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"v-sR4H6eXAHM\"\n      },\n      \"source\": [\n        \"!pip install -r requirements.txt\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    },\n    {\n      \"cell_type\": \"code\",\n      \"metadata\": {\n        \"id\": \"7pdZu8W7XITw\"\n      },\n      \"source\": [\n        \"!python3 convert-float16-to-float32.py ./path/to/input.onnx ./path/to/output.onnx\"\n      ],\n      \"execution_count\": null,\n      \"outputs\": []\n    }\n  ]\n}"
  },
  {
    "path": "README.md",
    "content": "**NOTE (Sept 7th 2023)**: At this point you *may* want to use [Transformers.js](https://github.com/xenova/transformers.js) instead since it's well-maintained and supports quantized models which are much smaller. That said, if you don't want to include the whole Transformers.js library in your app (as of writing I'm not sure if [tree-shaking](https://developer.mozilla.org/en-US/docs/Glossary/Tree_shaking) is supported yet), then you can still directly use ONNX Runtime Web with the quantized models produced by the [Transformers.js conversion scripts](https://huggingface.co/docs/transformers.js/custom_usage#convert-your-models-to-onnx).\n\nHere are the relevant modules for Transformers.js:\n\n * https://huggingface.co/docs/transformers.js/api/models#module_models.CLIPTextModelWithProjection\n * https://huggingface.co/docs/transformers.js/api/models#module_models.CLIPVisionModelWithProjection\n\nHere's a full working example that uses Transformers.js:\n```js\nlet quantized = false; // change to `true` for a much smaller model (e.g. 
87mb vs 345mb for image model), but lower accuracy\nlet { AutoProcessor, CLIPVisionModelWithProjection, RawImage, AutoTokenizer, CLIPTextModelWithProjection } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js');\nlet imageProcessor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch16');\nlet visionModel = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16', {quantized});\nlet tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch16');\nlet textModel = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch16', {quantized});\n\nfunction cosineSimilarity(A, B) {\n  if(A.length !== B.length) throw new Error(\"A.length !== B.length\");\n  let dotProduct = 0, mA = 0, mB = 0;\n  for(let i = 0; i < A.length; i++){\n    dotProduct += A[i] * B[i];\n    mA += A[i] * A[i];\n    mB += B[i] * B[i];\n  }\n  mA = Math.sqrt(mA);\n  mB = Math.sqrt(mB);\n  let similarity = dotProduct / (mA * mB);\n  return similarity;\n}\n\n// get image embedding:\nlet image = await RawImage.read('https://i.imgur.com/RKsLoNB.png');\nlet imageInputs = await imageProcessor(image);\nlet { image_embeds } = await visionModel(imageInputs);\nconsole.log(image_embeds.data);\n\n// get text embedding:\nlet texts = ['a photo of an astronaut'];\nlet textInputs = tokenizer(texts, { padding: true, truncation: true });\nlet { text_embeds } = await textModel(textInputs);\nconsole.log(text_embeds.data);\n\nlet similarity = cosineSimilarity(image_embeds.data, text_embeds.data);\nconsole.log(similarity);\n```\nNote that the above code uses `clip-vit-base-patch16` instead of what's used in this repo, `clip-vit-base-patch32` - not sure which is best, but you can change `patch16` to `patch32` in the above code if you want to test it. Also note that you'll see some `GET`/`404` errors in the console - that's expected, since Transformers.js tries to load models locally first. 
There's probably a way to disable this.\n\nTransformers.js also has a **ton** of other models available, and it's quite easy to use. E.g. here's an example of a text embedding / retrieval model:\n```js\nlet { pipeline } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.5.4/dist/transformers.js');\nlet extractor = await pipeline('feature-extraction', 'Xenova/e5-large-v2');\nlet dotProduct = (vec1, vec2) => vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);\n\nlet passage1 = await extractor('passage: She likes carrots and celery.', { pooling: 'mean', normalize: true });\nlet passage2 = await extractor('passage: This is a good calculus guide.', { pooling: 'mean', normalize: true });\nlet query = await extractor('query: Taking care of rabbits', { pooling: 'mean', normalize: true });\n\nlet similarity1 = dotProduct(query.data, passage1.data);\nlet similarity2 = dotProduct(query.data, passage2.data);\n```\n\n---\n\n# OpenAI CLIP JavaScript\nOpenAI's CLIP model ported to JavaScript using the ONNX web runtime. I also got the LiT models working [here](https://github.com/josephrocca/lit-encoder-js).\n\n**Minimal demos**:\n* Image model: https://josephrocca.github.io/openai-clip-js/onnx-image-demo.html\n* Text model: https://josephrocca.github.io/openai-clip-js/onnx-text-demo.html\n\n**Example applications**:\n* Sorting/searching a local folder of images using a text prompt: https://github.com/josephrocca/clip-image-sorter\n\n**Server side**:\n* Deno: https://github.com/josephrocca/openai-clip-js/blob/main/deno-example.js\n* Node.js: https://github.com/josephrocca/openai-clip-js/blob/main/nodejs-example.js\n\n**Notes:**\n\n* The model files are about **4x** larger than they actually need to be - params are float32 instead of uint8. If you're using CLIP in a \"real\" web app, you should probably quantize it. 
[@minimaxir](https://github.com/minimaxir) has done it ([1](https://github.com/minimaxir/imgbeddings/blob/36fb4d7ac6b82694d109cef6f887d4cb9c49da0f/imgbeddings/models.py#L94), [2](https://huggingface.co/minimaxir/imgbeddings/blob/main/patch32_v1.onnx)), and that model [worked first try](https://jsbin.com/nupehazaju/edit?html,output) with ORT Web (which is amazing), but it outputs a 768 element vector instead of 512, which I think is because @minimaxir's model is missing the final projection head which puts image embeddings into same-sized space as text embeddings. I had a quick attempt at it in [the ONNX export notebook](https://colab.research.google.com/github/josephrocca/openai-clip-js/blob/main/Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb) (see cell after ONNX conversion), but it doesn't seem to be working. If you investigate this and get it working, please open an issue. Thanks to [@congraIiIso](https://twitter.com/congraIiIso) on Twitter for bringing the uint8 quantization to my attention! **Edit**: I've managed to get quantization \"working\", but the embeddings that the quantized models produce don't seem to be close enough to the correct embeddings. See [this comment](https://github.com/josephrocca/openai-clip-js/issues/3#issuecomment-1221482824) for details.\n* You should use bicubic resizing of images to get the most accurate embeddings. [Here's a simple](https://gist.github.com/josephrocca/d97e0532f34e1205f4006d45ca909024) copy-paste JavaScript bicubic resize + center crop function that uses [wasm-vips](https://github.com/kleisauke/wasm-vips).\n  * More info: In the above-linked image model demo, the image encoder demo uses the default HTML5 canvas resize algorithm when pre-processing the input image. This is apparently not bicubic (which is what OpenAI's CLIP repo uses). This leads to the embeddings being a bit different to what Pytorch gives. 
I'm not sure if this will end up mattering in practical usage, but in case it matters to you, you should not use canvas resizing, and instead use an actual bicubic resizer. For example, [this astronaut pic](https://i.imgur.com/ec4Ao4s.png) has this embedding with the Pytorch model: `[0.3181,0.3054,-0.1548,0.0767,-0.1699,0.1320,-0.2974,-0.1940,-0.3052,0.2299,0.1995, -0.3025,0.3108,-0.2305,0.2368, ...]` and ONNX Runtime Web (wasm backend) gives: `[0.3635,0.3301,-0.1093,0.0598,-0.1526,0.1127,-0.3373,-0.1544,-0.2627,0.2372,-0.2012,-0.3182,0.3022,-0.2940,0.2227, ...]`. If you pre-resize the image with a bicubic algorithm ([like this](https://i.imgur.com/RKsLoNB.png) - the default image used in the demo), then the embeddings are basically the same.\n* The ONNX text model produces embeddings that seem to be close enough to the Pytorch model based on \"eyeballing\" some image/text matching tasks, but note that there are some non-trivial-looking differences. Again, I don't know whether these differences are enough to significantly affect real-world usage. Please feel free to open an issue if you manage to run some proper tests. 
Here are the embeddings for \"a portrait of an astronaut with the American flag\" in Pytorch and ONNX:\n  * Pytorch: `[-0.16650, 0.05167, -0.15320, 0.44922, 0.20642, -0.29565, 0.04041, -0.41064, -0.15015, 0.31934, -0.06842, -0.25464, 0.12311, -0.09509, 0.24109, -0.04883, 0.26074, 0.00045, 0.20972, 0.36987, ...]`\n  * ONNX: `[-0.19535, 0.01808, -0.09647, 0.61671, 0.17760, -0.30735, -0.03580, -0.31977, -0.21485, 0.38863, 0.05983, -0.24685, 0.17829, -0.16579, 0.17799, -0.07826, 0.28496, -0.02429, 0.11830, 0.37698, ...]`\n* Models are served to the browser directly from [this HuggingFace 🤗 repo](https://huggingface.co/rocca/openai-clip-js/tree/main).\n* Regarding model conversion:\n  * I used [this Colab notebook](https://colab.research.google.com/github/josephrocca/openai-clip-js/blob/main/Export_CLIP_to_ONNX_tflite_tfjs_tf_saved_model.ipynb) to convert the Pytorch models to ONNX/tfjs/etc.\n  * I used [this Colab notebook](https://colab.research.google.com/github/josephrocca/openai-clip-js/blob/main/ONNX_float16_to_float32.ipynb) to convert weights from float16 to float32 because the ONNX web runtime doesn't currently support float16. This means that the model files are twice as big as they should be ([issue](https://github.com/microsoft/onnxruntime/issues/9758)).\n  * See the comment at the top of [this file](https://github.com/josephrocca/onnx-typecast/blob/master/fix-clip-text-vit-32-float32---scratch.py) for an extra conversion step that needs to be applied to the text model to avoid [this error](https://github.com/microsoft/onnxruntime/issues/9760#issue-1053052192). \n\n\n**Todo (maybe):**\n* Try tfjs runtime if [this issue](https://github.com/tensorflow/tfjs/issues/5847) gets resolved.\n* Try to get tflite model exporting and working.\n"
  },
  {
    "path": "deno-example.js",
    "content": "import { createCanvas, loadImage } from \"https://deno.land/x/canvas@v1.4.1/mod.ts\";\nimport { serve } from \"https://deno.land/std@0.144.0/http/server.ts\";\nimport \"https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/ort.js\";\n\nort.env.wasm.wasmPaths = \"https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/\";\n\nlet onnxImageSession = await ort.InferenceSession.create(\"https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx\", { executionProviders: [\"wasm\"] });\n\n// let onnxTextSession = await ort.InferenceSession.create(\"https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-text-vit-32-float32-int32.onnx\", { executionProviders: [\"wasm\"] });\n// let Tokenizer = (await import(\"https://deno.land/x/clip_bpe@v0.0.6/mod.js\")).default;\n// let textTokenizer = new Tokenizer();\n\nconsole.log(\"Finished loading CLIP image model.\");\n\nawait serve(async request => {\n  if(!URL.canParse(request.url)) return new Response(\"Invalid URL.\");\n  \n  const urlData = new URL(request.url);\n  const params = Object.fromEntries(urlData.searchParams.entries());\n  const path = urlData.pathname;\n  const ip = request.headers.get('CF-Connecting-IP');\n  \n  if(path === \"/api/image\") {\n    console.log(\"params.imageUrl\", params.imageUrl);\n    let imageUrl = params.imageUrl ?? 
(await request.json()).imageUrl;\n    let embedding = await embedImage(imageUrl);\n    return new Response(JSON.stringify([...embedding]));\n  }\n\n  return new Response(\"Not found.\", {status:404});\n}, {port: Deno.env.get(\"PORT\")});\n\nasync function embedImage(url) {\n  let rgbData = await getRgbData(url);\n\n  const feeds = {'input': new ort.Tensor('float32', rgbData, [1,3,224,224])};\n\n  let t = Date.now();\n  console.log(\"Running inference...\");\n  const results = await onnxImageSession.run(feeds);\n  console.log(`Finished inference in ${Date.now()-t}ms`);\n\n  const data = results[\"output\"].data;\n  // console.log(`data of result tensor 'output'`, data);\n  return data;\n}\n\n// async function embedText(text) {\n//   let textTokens = textTokenizer.encodeForCLIP(text);\n//   textTokens = Int32Array.from(textTokens);\n//   const feeds = {input: new ort.Tensor('int32', textTokens, [1, 77])};\n//   const results = await onnxTextSession.run(feeds);\n//   return [...results[\"output\"].data];\n// }\n\nasync function getRgbData(imgUrl) {\n  let img = await loadImage(imgUrl);\n  let canvas = createCanvas(224, 224);\n  let ctx = canvas.getContext(\"2d\");\n  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);\n  let imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);\n\n  let rgbData = [[], [], []]; // [r, g, b]\n  // remove alpha and put into correct shape:\n  let d = imageData.data;\n  for(let i = 0; i < d.length; i += 4) { \n    let x = (i/4) % canvas.width;\n    let y = Math.floor((i/4) / canvas.width)\n    if(!rgbData[0][y]) rgbData[0][y] = [];\n    if(!rgbData[1][y]) rgbData[1][y] = [];\n    if(!rgbData[2][y]) rgbData[2][y] = [];\n    rgbData[0][y][x] = d[i+0]/255;\n    rgbData[1][y][x] = d[i+1]/255;\n    rgbData[2][y][x] = d[i+2]/255;\n    // From CLIP repo: Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))\n    rgbData[0][y][x] = (rgbData[0][y][x] - 0.48145466) / 0.26862954;\n    
rgbData[1][y][x] = (rgbData[1][y][x] - 0.4578275) / 0.26130258;\n    rgbData[2][y][x] = (rgbData[2][y][x] - 0.40821073) / 0.27577711;\n  }\n  rgbData = Float32Array.from(rgbData.flat().flat());\n  return rgbData;\n}\n"
  },
  {
    "path": "enable-threads.js",
    "content": "// NOTE: This file creates a service worker that cross-origin-isolates the page (read more here: https://web.dev/coop-coep/) which allows us to use wasm threads.\n// Normally you would set the COOP and COEP headers on the server to do this, but Github Pages doesn't allow this, so this is a hack to do that.\n\n/* Edited version of: coi-serviceworker v0.1.6 - Guido Zuidhof, licensed under MIT */\n// From here: https://github.com/gzuidhof/coi-serviceworker\nif(typeof window === 'undefined') {\n  self.addEventListener(\"install\", () => self.skipWaiting());\n  self.addEventListener(\"activate\", e => e.waitUntil(self.clients.claim()));\n\n  async function handleFetch(request) {\n    if(request.cache === \"only-if-cached\" && request.mode !== \"same-origin\") {\n      return;\n    }\n    \n    if(request.mode === \"no-cors\") { // We need to set `credentials` to \"omit\" for no-cors requests, per this comment: https://bugs.chromium.org/p/chromium/issues/detail?id=1309901#c7\n      request = new Request(request.url, {\n        cache: request.cache,\n        credentials: \"omit\",\n        headers: request.headers,\n        integrity: request.integrity,\n        destination: request.destination,\n        keepalive: request.keepalive,\n        method: request.method,\n        mode: request.mode,\n        redirect: request.redirect,\n        referrer: request.referrer,\n        referrerPolicy: request.referrerPolicy,\n        signal: request.signal,\n      });\n    }\n    \n    let r = await fetch(request).catch(e => console.error(e));\n    \n    if(r.status === 0) {\n      return r;\n    }\n\n    const headers = new Headers(r.headers);\n    headers.set(\"Cross-Origin-Embedder-Policy\", \"credentialless\"); // or: require-corp\n    headers.set(\"Cross-Origin-Opener-Policy\", \"same-origin\");\n    \n    return new Response(r.body, { status: r.status, statusText: r.statusText, headers });\n  }\n\n  self.addEventListener(\"fetch\", function(e) {\n    
e.respondWith(handleFetch(e.request)); // respondWith must be executed synchronously (but can be passed a Promise)\n  });\n  \n} else {\n  (async function() {\n    if(window.crossOriginIsolated !== false) return;\n\n    let registration = await navigator.serviceWorker.register(window.document.currentScript.src).catch(e => console.error(\"COOP/COEP Service Worker failed to register:\", e));\n    if(registration) {\n      console.log(\"COOP/COEP Service Worker registered\", registration.scope);\n\n      registration.addEventListener(\"updatefound\", () => {\n        console.log(\"Reloading page to make use of updated COOP/COEP Service Worker.\");\n        window.location.reload();\n      });\n\n      // If the registration is active, but it's not controlling the page\n      if(registration.active && !navigator.serviceWorker.controller) {\n        console.log(\"Reloading page to make use of COOP/COEP Service Worker.\");\n        window.location.reload();\n      }\n    }\n  })();\n}\n\n// Code to deregister:\n// let registrations = await navigator.serviceWorker.getRegistrations();\n// for(let registration of registrations) {\n//   await registration.unregister();\n// }\n"
  },
  {
    "path": "nodejs-example.js",
    "content": "// npm install canvas onnxruntime-web\nconst { createCanvas, loadImage } = require('canvas');\nconst ort = require('onnxruntime-web');\n\nort.env.wasm.numThreads = 1; // otherwise for some reason I get \"TypeError [ERR_WORKER_PATH]: The worker script or module filename must be an absolute path\"\n\nlet onnxImageSession;\n\n(async function() {\n  console.log(\"loading clip model...\");\n  onnxImageSession = await ort.InferenceSession.create(\"https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx\", { executionProviders: [\"wasm\"] });\n  console.log(\"loaded. now running inference...\");\n  await embedImage(\"https://i.imgur.com/RKsLoNB.png\"); // can also pass it a dataURL\n})();\n\nasync function embedImage(url) {\n  let rgbData = await getRgbData(url);\n\n  const feeds = {'input': new ort.Tensor('float32', rgbData, [1,3,224,224])};\n\n  let t = Date.now();\n  console.log(\"Running inference...\");\n  const results = await onnxImageSession.run(feeds);\n  console.log(`Finished inference in ${Date.now()-t}ms`);\n\n  const data = results[\"output\"].data;\n  // console.log(`data of result tensor 'output'`, data);\n  return data;\n}\n\nasync function embedText(text) {\n  let textTokens = textTokenizer.encodeForCLIP(text);\n  textTokens = Int32Array.from(textTokens);\n  const feeds = {input: new ort.Tensor('int32', textTokens, [1, 77])};\n  const results = await onnxTextSession.run(feeds);\n  return [...results[\"output\"].data];\n}\n\nasync function getRgbData(imgUrl) {\n  let img = await loadImage(imgUrl);\n  let canvas = createCanvas(224, 224);\n  let ctx = canvas.getContext(\"2d\");\n  ctx.drawImage(img, 0, 0, canvas.width, canvas.height);\n  let imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);\n\n  let rgbData = [[], [], []]; // [r, g, b]\n  // remove alpha and put into correct shape:\n  let d = imageData.data;\n  for(let i = 0; i < d.length; i += 4) { \n    let x = (i/4) % canvas.width;\n    let y 
= Math.floor((i/4) / canvas.width)\n    if(!rgbData[0][y]) rgbData[0][y] = [];\n    if(!rgbData[1][y]) rgbData[1][y] = [];\n    if(!rgbData[2][y]) rgbData[2][y] = [];\n    rgbData[0][y][x] = d[i+0]/255;\n    rgbData[1][y][x] = d[i+1]/255;\n    rgbData[2][y][x] = d[i+2]/255;\n    // From CLIP repo: Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))\n    rgbData[0][y][x] = (rgbData[0][y][x] - 0.48145466) / 0.26862954;\n    rgbData[1][y][x] = (rgbData[1][y][x] - 0.4578275) / 0.26130258;\n    rgbData[2][y][x] = (rgbData[2][y][x] - 0.40821073) / 0.27577711;\n  }\n  rgbData = Float32Array.from(rgbData.flat().flat());\n  return rgbData;\n}\n"
  },
  {
    "path": "onnx-image-demo.html",
    "content": "<!DOCTYPE html>\n<html>\n  <head>\n    <title>OpenAI CLIP JavaScript - Image Demo - ONNX Web Runtime</title>\n    <script src=\"enable-threads.js\"></script>\n  </head>\n  <body>\n    <script src=\"https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/ort.js\"></script>\n    \n    <div>\n      imgur.com url (ideally 224x224): <input id=\"imgUrlInput\" value=\"https://i.imgur.com/RKsLoNB.png\">\n      <!-- karpathy: https://i.imgur.com/WEIKDpX.jpg -->\n      <!-- 512px astronaut: https://i.imgur.com/ec4Ao4s.png -->\n      <br>\n      backend: <select id=\"backendSelectEl\">\n        <!-- explicit values so the provider name passed to onnxruntime is never the option's label text -->\n        <option value=\"wasm\">wasm</option>\n        <option value=\"webgl\">webgl (doesn't work yet)</option>\n      </select>\n      <br>\n      quantized: <select id=\"quantizedSelectEl\">\n        <option value=\"no\">no</option>\n        <option value=\"yes\">yes (4x smaller model, but currently the embeddings are inaccurate - see readme)</option>\n      </select>\n      <br>\n      <button id=\"startBtn\" onclick=\"main()\">start</button>\n    </div>\n    <p><a href=\"https://github.com/josephrocca/openai-clip-js\">github repo</a> - <a href=\"https://huggingface.co/rocca/openai-clip-js/tree/main\">huggingface repo</a></p>\n    \n    <script>\n      if(self.crossOriginIsolated) { // needs to be cross-origin-isolated to use wasm threads. you need to serve this html file with these two headers: https://web.dev/coop-coep/\n        ort.env.wasm.numThreads = navigator.hardwareConcurrency\n      }\n      \n      // Click handler for the start button: downloads the selected CLIP image model, embeds the image at the url in #imgUrlInput, and logs the embedding to the console.\n      async function main() {\n        startBtn.disabled = true;\n        startBtn.innerHTML = \"see console\";\n        \n        console.log(\"Downloading model... (see network tab for progress)\");\n        // let modelPath = backendSelectEl.value === \"webgl\" ? './clip-image-vit-32-int32-float32.onnx' : './clip-image-vit-32-float32.onnx';\n        let modelPath = quantizedSelectEl.value === \"no\" ? 'https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-float32.onnx' : 'https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-image-vit-32-uint8.onnx';\n        const session = await ort.InferenceSession.create(modelPath, { executionProviders: [backendSelectEl.value] });\n        console.log(\"Model loaded.\");\n\n        // for console debugging:\n        window.session = session;\n\n        let rgbData = await getRgbData(imgUrlInput.value);\n\n        const feeds = {'input': new ort.Tensor('float32', rgbData, [1,3,224,224])};\n\n        console.log(\"Running inference...\");\n        const results = await session.run(feeds);\n        console.log(\"Finished inference.\");\n\n        const data = results[\"output\"].data;\n        console.log(`data of result tensor 'output'`, data);\n      }\n      \n      // Fetches the image at `imgUrl`, draws it onto a 224x224 canvas and returns the normalized pixels as a Float32Array\n      // in planar order (all r values, then all g, then all b) so it can be fed to the model as a [1,3,224,224] tensor.\n      //   mode \"resizeAndCenterCrop\": scale so the shorter side fills the canvas, then crop the overflow around the center.\n      //   mode \"squash\": stretch the whole image onto the canvas, ignoring aspect ratio.\n      // Throws if `mode` is anything else.\n      async function getRgbData(imgUrl, mode=\"resizeAndCenterCrop\") {\n        let blob = await fetch(imgUrl, {referrer:\"\"}).then(r => r.blob());\n        let img = await createImageBitmap(blob);\n\n        // feature-test `document` without touching `window`, which is not defined outside of a page (e.g. in a worker)\n        const hasDom = typeof document !== \"undefined\";\n\n        let canvas;\n        if(hasDom) {\n          canvas = document.createElement(\"canvas\");\n          canvas.width = 224;\n          canvas.height = 224;\n        } else {\n          canvas = new OffscreenCanvas(224, 224);\n        }\n\n        let ctx = canvas.getContext(\"2d\");\n\n        if(mode === \"resizeAndCenterCrop\") {\n          // scale img to fit the shorter side to the canvas size\n          let scale = Math.max(canvas.width / img.width, canvas.height / img.height);\n          \n          // compute new image dimensions that would maintain the original aspect ratio\n          let scaledW = img.width * scale;\n          let scaledH = img.height * scale;\n          \n          // compute position to center the image\n          let posX = (canvas.width - scaledW) / 2;\n          let posY = (canvas.height - scaledH) / 2;\n          \n          // draw the image centered and scaled on the canvas\n          ctx.drawImage(img, posX, posY, scaledW, scaledH);\n        } else if(mode === \"squash\") {\n          ctx.drawImage(img, 0, 0, canvas.width, canvas.height);\n        } else {\n          // fail loudly instead of silently embedding a blank canvas\n          throw new Error(`Unknown getRgbData mode: ${mode}`);\n        }\n\n        if(hasDom) document.body.appendChild(canvas); // can be removed - just to visualize the crop\n        \n        let imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);\n\n        // From CLIP repo: Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))\n        const mean = [0.48145466, 0.4578275, 0.40821073];\n        const std = [0.26862954, 0.26130258, 0.27577711];\n\n        // remove alpha and write straight into the planar layout (channel, then row, then column):\n        let d = imageData.data;\n        const pixelCount = canvas.width * canvas.height;\n        let rgbData = new Float32Array(3 * pixelCount);\n        for(let p = 0; p < pixelCount; p++) {\n          for(let c = 0; c < 3; c++) {\n            rgbData[c*pixelCount + p] = (d[p*4 + c]/255 - mean[c]) / std[c];\n          }\n        }\n        return rgbData;\n      }\n    </script>\n  </body>\n</html>\n"
  },
  {
    "path": "onnx-text-demo.html",
    "content": "<!DOCTYPE html>\n<html>\n  <head>\n    <title>OpenAI CLIP JavaScript - Text Demo - ONNX Web Runtime</title>\n    <script src=\"enable-threads.js\"></script>\n  </head>\n  <body>\n    <script src=\"https://cdn.jsdelivr.net/npm/onnxruntime-web@1.15.1/dist/ort.js\"></script>\n    \n    <div>\n      input text <input id=\"textInputEl\" value=\"This is a portrait of an astronaut with the American flag\">\n      <br>\n      quantized: <select id=\"quantizedSelectEl\">\n        <option value=\"no\">no</option>\n        <option value=\"yes\">yes (4x smaller model, but currently the embeddings are inaccurate - see readme)</option>\n      </select>\n      <br>\n      <button id=\"startBtn\" onclick=\"main()\">start</button>\n    </div>\n    <p><a href=\"https://github.com/josephrocca/openai-clip-js\">github repo</a></p>\n    \n    <script>\n      // wasm threads are only available on a cross-origin-isolated page, i.e. one served with the two headers described at https://web.dev/coop-coep/\n      if(self.crossOriginIsolated) {\n        ort.env.wasm.numThreads = navigator.hardwareConcurrency;\n      }\n      \n      // Click handler for the start button: loads the chosen CLIP text model, tokenizes the text in #textInputEl,\n      // runs it through the model and logs the resulting embedding to the console.\n      async function main() {\n        startBtn.disabled = true;\n        startBtn.innerHTML = \"see console\";\n        \n        console.log(\"Loading model... (see network tab for progress)\");\n        let modelUrl;\n        if(quantizedSelectEl.value === \"no\") {\n          modelUrl = 'https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-text-vit-32-float32-int32.onnx';\n        } else {\n          modelUrl = 'https://huggingface.co/rocca/openai-clip-js/resolve/main/clip-text-vit-32-uint8.onnx';\n        }\n        const textSession = await ort.InferenceSession.create(modelUrl, { executionProviders: [\"wasm\"] });\n        console.log(\"Model loaded.\");\n        \n        // the tokenizer class is the default export of the clip_bpe module\n        const tokenizerModule = await import(\"https://deno.land/x/clip_bpe@v0.0.6/mod.js\");\n        const tokenizer = new tokenizerModule.default();\n        const tokenIds = Int32Array.from(tokenizer.encodeForCLIP(textInputEl.value));\n        const inputTensor = new ort.Tensor('int32', tokenIds, [1, 77]);\n\n        console.log(\"Running inference...\");\n        const outputs = await textSession.run({input: inputTensor});\n        console.log(\"Finished inference.\");\n\n        console.log(`data of result tensor 'output'`, outputs.output.data);\n      }\n    </script>\n  </body>\n</html>\n"
  },
  {
    "path": "tfjs-text-demo.html",
    "content": "<!DOCTYPE html>\n<html>\n  <head>\n    <title>OpenAI CLIP JavaScript - Text Demo - tfjs</title>\n  </head>\n  <body>\n    <script src=\"https://cdn.jsdelivr.net/npm/@tensorflow/tfjs@3.11.0/dist/tf.js\"></script>\n    <script src=\"https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-backend-wasm@3.11.0/dist/tf-backend-wasm.js\"></script>\n    \n    <h3>Note: To run this you need to clone <a href=\"https://github.com/josephrocca/openai-clip-js\">this repo</a>, and then download the tfjs model folder from <a href=\"https://drive.google.com/drive/folders/1-GI6-OTDiJcjYKTavoobbubc9BYjQDzW?usp=sharing\">here</a> and name the folder \"clip-text-vit-32-tfjs\", and then run a static file server in the repo directory.</h3>\n    \n    <div>\n      input text <input id=\"textInputEl\" value=\"hello world!\">\n      backend: <select id=\"backendSelectEl\">\n        <option>wasm</option>\n        <option>webgl</option>\n      </select>\n      <button id=\"startBtn\" onclick=\"main()\">start</button>\n    </div>\n    <p><a href=\"https://github.com/josephrocca/openai-clip-js\">github repo</a> - <a href=\"https://huggingface.co/rocca/openai-clip-js/tree/main\">huggingface repo</a></p>\n    \n    <script>\n      // Click handler for the start button: selects the chosen tfjs backend, loads the CLIP text graph model from\n      // ./clip-text-vit-32-tfjs/, tokenizes the text in #textInputEl, runs the model and logs the embedding to the console.\n      async function main() {\n        startBtn.disabled = true;\n        startBtn.innerHTML = \"see console\";\n\n        await tf.setBackend(backendSelectEl.value);\n        \n        console.log(\"Loading model... (see network tab for progress)\");\n        let modelPath = './clip-text-vit-32-tfjs/model.json';\n        let model = await tf.loadGraphModel(modelPath);\n        console.log(\"Model loaded.\");\n        \n        let Tokenizer = (await import(\"https://deno.land/x/clip_bpe@v0.0.6/mod.js\")).default;\n        let t = new Tokenizer();\n        let textTokens = t.encodeForCLIP(textInputEl.value);\n        textTokens = Float32Array.from(textTokens);\n        let input = {'input': tf.tensor(textTokens, [1, 77], \"float32\")};\n\n        console.log(\"Running inference...\");\n        // execute() hands back a bare tensor when a single output is requested and an array of tensors otherwise, so accept both\n        const results = await model.execute(input, [\"output\"]);\n        const outputTensor = Array.isArray(results) ? results[0] : results;\n        console.log(\"Finished inference.\");\n\n        // data() is an async method that downloads the tensor values into a typed array\n        const data = await outputTensor.data();\n        console.log(`data of result tensor 'output'`, data);\n      }\n\n    </script>\n  </body>\n</html>\n"
  }
]